import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
import tensorflow as tf
import os
import time
Using TensorFlow backend.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
/kaggle/input/am-en-mt-dataset/am-en-both.pkl
/kaggle/input/am-en-mt-dataset/am-en-test.pkl
/kaggle/input/am-en-mt-dataset/am-en-train.pkl
# load a clean dataset
def load_clean_sentences(filename):
    return load(open('/kaggle/input/am-en-mt-dataset/' + filename, 'rb'))
# fit a tokenizer
def create_tokenizer(data):
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(data)
    return tokenizer
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)
# load datasets
dataset = load_clean_sentences('am-en-both.pkl')
train = load_clean_sentences('am-en-train.pkl')
test = load_clean_sentences('am-en-test.pkl')
# prepare amharic tokenizer
am_tokenizer = create_tokenizer(dataset[:, 0].tolist())
am_vocab_size = len(am_tokenizer.word_index) + 1
am_length = max_length(dataset[:, 0])
print('Amharic Vocabulary Size: %d' % am_vocab_size)
print('Amharic Max Length: %d' % (am_length))
# prepare english tokenizer
en_tokenizer = create_tokenizer(dataset[:, 1].tolist())
en_vocab_size = len(en_tokenizer.word_index) + 1
en_length = max_length(dataset[:, 1])
print('English Vocabulary Size: %d' % en_vocab_size)
print('English Max Length: %d' % (en_length))
Amharic Vocabulary Size: 72705
Amharic Max Length: 56
English Vocabulary Size: 15881
English Max Length: 98
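# Added sketch (not part of the original notebook): a quick look at what the fitted
# tokenizer does. texts_to_sequences maps whitespace-separated tokens to integer ids,
# and index_word maps them back; the round trip should reproduce the (lowercased) tokens.
# sample_sentence and sample_ids are illustrative names introduced here.
sample_sentence = dataset[0, 1]
sample_ids = en_tokenizer.texts_to_sequences([sample_sentence])[0]
print(sample_ids)
print(' '.join(en_tokenizer.index_word[i] for i in sample_ids))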
def prepare_seq(data, lang_tokenizer):
    # note: this fits the tokenizer once more on the given split before converting to ids
    lang_tokenizer.fit_on_texts(data)
    tensor = lang_tokenizer.texts_to_sequences(data)
    tensor = pad_sequences(tensor, padding='post')
    return tensor
am_train=train[:, 0].tolist()
en_train=train[:, 1].tolist()
am_test=test[:, 0].tolist()
en_test=test[:, 1].tolist()
# prepare training data
trainX=prepare_seq(am_train, am_tokenizer)
trainY=prepare_seq(en_train, en_tokenizer)
# prepare validation data
testX = prepare_seq(am_test,am_tokenizer)
testY = prepare_seq(en_test, en_tokenizer)
print(len(trainX),len(trainY),len(testX),len(testY))
25000 25000 5000 5000
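# Added sanity check (not in the original run): the padded arrays have one row per
# sentence pair, and the second dimension is the longest sentence in each split.
print(trainX.shape, trainY.shape, testX.shape, testY.shape)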
def convert(lang, tensor):
    for t in tensor:
        if t != 0:
            print("%d ----> %s" % (t, lang.index_word[t]))
print ("Input Language; index to word mapping")
convert(am_tokenizer, trainX[0])
print ()
print ("Target Language; index to word mapping")
convert(en_tokenizer, trainY[0])
Input Language; index to word mapping
2 ----> <start>
51 ----> ፯
64 ----> ፲፭
7725 ----> ፃድቁንም
12833 ----> መለስ
9102 ----> በነበርንበት
488 ----> ዙሪያ
7 ----> ላይ
30306 ----> እንድትድኑ
4 ----> ፤
8437 ----> አኪቃም
515 ----> የነበረውን
13988 ----> ለመመዝገብ
1 ----> ።
3 ----> <end>
Target Language; index to word mapping
5 ----> <start>
106 ----> 7
44 ----> so
20 ----> they
3390 ----> silas
1296 ----> bound
33 ----> on
1 ----> the
907 ----> reigned
17 ----> that
12 ----> he
58 ----> had
885 ----> prepared
14 ----> for
1287 ----> crossed
2 ----> ,
4 ----> and
1 ----> the
206 ----> fire
1081 ----> secret
5157 ----> na·thana·el
3 ----> .
6 ----> <end>
BUFFER_SIZE = len(trainX)
BATCH_SIZE = 64
steps_per_epoch = len(trainX)//BATCH_SIZE
embedding_dim = 100
units = 1024
dataset = tf.data.Dataset.from_tensor_slices((trainX, trainY)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
x_batch, y_batch = next(iter(dataset))
x_batch.shape, y_batch.shape
(TensorShape([64, 56]), TensorShape([64, 98]))
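# Optional tweak, an assumption rather than part of the original notebook: prefetching lets
# the tf.data pipeline prepare the next batch while the current one is being consumed.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)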
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
encoder = Encoder(am_vocab_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(x_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))
Encoder output shape: (batch size, sequence length, units) (64, 56, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query (the decoder hidden state) shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(query, 1)
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)
print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))
Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 56, 1)
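# Added sanity check (not from the original run): the layer computes additive attention,
# score_t = V·tanh(W1·h_enc_t + W2·h_dec), followed by a softmax over the time axis, so the
# weights for each example should sum to (approximately) 1 across the 56 encoder steps.
print(tf.reduce_sum(attention_weights, axis=1)[:3].numpy())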
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        # output shape == (batch_size, vocab)
        x = self.fc(output)
        return x, state, attention_weights
decoder = Decoder(en_vocab_size, embedding_dim, units, BATCH_SIZE)
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))
Decoder output shape: (batch_size, vocab size) (64, 15881)
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)
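# Added toy example (not in the original notebook) of how the padding mask works: the loss
# contribution of a <pad> position (id 0) is zeroed out before averaging.
toy_real = tf.constant([2, 0])                    # second position is padding
toy_pred = tf.random.uniform((2, 5))              # logits over a hypothetical 5-word vocabulary
toy_loss = loss_object(toy_real, toy_pred)        # per-token loss, shape (2,)
toy_mask = tf.cast(tf.math.not_equal(toy_real, 0), toy_loss.dtype)
print((toy_loss * toy_mask).numpy())              # the padded position contributes 0.0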
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
EPOCHS = 10
for epoch in range(EPOCHS):
    start = time.time()
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
Epoch 1 Batch 0 Loss 2.9657
Epoch 1 Batch 100 Loss 1.8474
Epoch 1 Batch 200 Loss 1.5982
Epoch 1 Batch 300 Loss 1.6906
Epoch 1 Loss 1.7903
Time taken for 1 epoch 446.7812497615814 sec
Epoch 2 Batch 0 Loss 1.6074
Epoch 2 Batch 100 Loss 1.5171
Epoch 2 Batch 200 Loss 1.5066
Epoch 2 Batch 300 Loss 1.4055
Epoch 2 Loss 1.5082
Time taken for 1 epoch 317.6664056777954 sec
Epoch 3 Batch 0 Loss 1.3750
Epoch 3 Batch 100 Loss 1.2758
Epoch 3 Batch 200 Loss 1.3768
Epoch 3 Batch 300 Loss 1.2888
Epoch 3 Loss 1.3280
Time taken for 1 epoch 318.0388603210449 sec
Epoch 4 Batch 0 Loss 1.0995
Epoch 4 Batch 100 Loss 1.1701
Epoch 4 Batch 200 Loss 1.3564
Epoch 4 Batch 300 Loss 1.2092
Epoch 4 Loss 1.2142
Time taken for 1 epoch 318.29970240592957 sec
Epoch 5 Batch 0 Loss 1.2254
Epoch 5 Batch 100 Loss 1.1753
Epoch 5 Batch 200 Loss 1.1432
Epoch 5 Batch 300 Loss 1.0978
Epoch 5 Loss 1.1250
Time taken for 1 epoch 317.6780345439911 sec
Epoch 6 Batch 0 Loss 0.9184
Epoch 6 Batch 100 Loss 0.9374
Epoch 6 Batch 200 Loss 1.2070
Epoch 6 Batch 300 Loss 1.0207
Epoch 6 Loss 1.0468
Time taken for 1 epoch 318.3552486896515 sec
Epoch 7 Batch 0 Loss 0.9537
Epoch 7 Batch 100 Loss 0.9725
Epoch 7 Batch 200 Loss 1.0686
Epoch 7 Batch 300 Loss 1.0371
Epoch 7 Loss 0.9777
Time taken for 1 epoch 317.72064185142517 sec
Epoch 8 Batch 0 Loss 0.9688
Epoch 8 Batch 100 Loss 0.8350
Epoch 8 Batch 200 Loss 0.8334
Epoch 8 Batch 300 Loss 1.0320
Epoch 8 Loss 0.9131
Time taken for 1 epoch 318.37996006011963 sec
Epoch 9 Batch 0 Loss 0.7715
Epoch 9 Batch 100 Loss 0.8917
Epoch 9 Batch 200 Loss 0.8646
Epoch 9 Batch 300 Loss 0.8058
Epoch 9 Loss 0.8515
Time taken for 1 epoch 317.5151901245117 sec
Epoch 10 Batch 0 Loss 0.7025
Epoch 10 Batch 100 Loss 0.7091
Epoch 10 Batch 200 Loss 0.7524
Epoch 10 Batch 300 Loss 0.8210
Epoch 10 Loss 0.7932
Time taken for 1 epoch 318.22093963623047 sec
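# Added suggestion, not part of the original run: saving once more after the loop guarantees
# the final epoch's weights are checkpointed even if EPOCHS is not a multiple of 2.
checkpoint.save(file_prefix=checkpoint_prefix)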
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)
### preprocessing
import re
import os
import numpy as np
def normalize_char_level_missmatch(input_token):
    rep1 = re.sub('[ሃኅኃሐሓኻ]', 'ሀ', input_token)
    rep2 = re.sub('[ሑኁዅ]', 'ሁ', rep1)
    rep3 = re.sub('[ኂሒኺ]', 'ሂ', rep2)
    rep4 = re.sub('[ኌሔዄ]', 'ሄ', rep3)
    rep5 = re.sub('[ሕኅ]', 'ህ', rep4)
    rep6 = re.sub('[ኆሖኾ]', 'ሆ', rep5)
    rep7 = re.sub('[ሠ]', 'ሰ', rep6)
    rep8 = re.sub('[ሡ]', 'ሱ', rep7)
    rep9 = re.sub('[ሢ]', 'ሲ', rep8)
    rep10 = re.sub('[ሣ]', 'ሳ', rep9)
    rep11 = re.sub('[ሤ]', 'ሴ', rep10)
    rep12 = re.sub('[ሥ]', 'ስ', rep11)
    rep13 = re.sub('[ሦ]', 'ሶ', rep12)
    rep14 = re.sub('[ዓኣዐ]', 'አ', rep13)
    rep15 = re.sub('[ዑ]', 'ኡ', rep14)
    rep16 = re.sub('[ዒ]', 'ኢ', rep15)
    rep17 = re.sub('[ዔ]', 'ኤ', rep16)
    rep18 = re.sub('[ዕ]', 'እ', rep17)
    rep19 = re.sub('[ዖ]', 'ኦ', rep18)
    rep20 = re.sub('[ጸ]', 'ፀ', rep19)
    rep21 = re.sub('[ጹ]', 'ፁ', rep20)
    rep22 = re.sub('[ጺ]', 'ፂ', rep21)
    rep23 = re.sub('[ጻ]', 'ፃ', rep22)
    rep24 = re.sub('[ጼ]', 'ፄ', rep23)
    rep25 = re.sub('[ጽ]', 'ፅ', rep24)
    rep26 = re.sub('[ጾ]', 'ፆ', rep25)
    # normalize words with labialized Amharic characters, e.g. በልቱዋል or በልቱአል to በልቷል
    rep27 = re.sub('(ሉ[ዋአ])', 'ሏ', rep26)
    rep28 = re.sub('(ሙ[ዋአ])', 'ሟ', rep27)
    rep29 = re.sub('(ቱ[ዋአ])', 'ቷ', rep28)
    rep30 = re.sub('(ሩ[ዋአ])', 'ሯ', rep29)
    rep31 = re.sub('(ሱ[ዋአ])', 'ሷ', rep30)
    rep32 = re.sub('(ሹ[ዋአ])', 'ሿ', rep31)
    rep33 = re.sub('(ቁ[ዋአ])', 'ቋ', rep32)
    rep34 = re.sub('(ቡ[ዋአ])', 'ቧ', rep33)
    rep35 = re.sub('(ቹ[ዋአ])', 'ቿ', rep34)
    rep36 = re.sub('(ሁ[ዋአ])', 'ኋ', rep35)
    rep37 = re.sub('(ኑ[ዋአ])', 'ኗ', rep36)
    rep38 = re.sub('(ኙ[ዋአ])', 'ኟ', rep37)
    rep39 = re.sub('(ኩ[ዋአ])', 'ኳ', rep38)
    rep40 = re.sub('(ዙ[ዋአ])', 'ዟ', rep39)
    rep41 = re.sub('(ጉ[ዋአ])', 'ጓ', rep40)
    rep42 = re.sub('(ዱ[ዋአ])', 'ዷ', rep41)
    rep43 = re.sub('(ጡ[ዋአ])', 'ጧ', rep42)
    rep44 = re.sub('(ጩ[ዋአ])', 'ጯ', rep43)
    rep45 = re.sub('(ጹ[ዋአ])', 'ጿ', rep44)
    rep46 = re.sub('(ፉ[ዋአ])', 'ፏ', rep45)
    rep47 = re.sub('[ቊ]', 'ቁ', rep46)  # ቁ can also be written as ቊ
    rep48 = re.sub('[ኵ]', 'ኩ', rep47)  # ኩ can also be written as ኵ
    return rep48
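# Added example (not from the original run): variant spellings of the same sound collapse
# to one canonical character, e.g. ሃ becomes ሀ.
print(normalize_char_level_missmatch('ሃይማኖት'))   # expected output: ሀይማኖት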
# remove all ASCII characters and punctuation marks
def remove_ascii(text_input):
    # remove_punc_and_special_chars is assumed to be defined elsewhere in the original project
    rm_num_and_ascii = re.sub('[A-Za-z]', '', remove_punc_and_special_chars(text_input))
    return rm_num_and_ascii
def arabic2geez(arabicNumber):
    ETHIOPIC_ONE = 0x1369
    ETHIOPIC_TEN = 0x1372
    ETHIOPIC_HUNDRED = 0x137B
    ETHIOPIC_TEN_THOUSAND = 0x137C
    arabicNumber = str(arabicNumber)
    n = len(arabicNumber) - 1  # length of the Arabic number
    if n % 2 == 0:
        arabicNumber = "0" + arabicNumber
        n += 1
    arabicBigrams = [arabicNumber[i:i+2] for i in range(0, n, 2)]  # splitting into bigrams
    reversedArabic = arabicBigrams[::-1]  # reversing list content
    geez = []
    for index, pair in enumerate(reversedArabic):
        curr_geez = ''
        artens = pair[0]  # Arabic tens
        arones = pair[1]  # Arabic ones
        amtens = ''
        amones = ''
        if artens != '0':
            amtens = str(chr((int(artens) + (ETHIOPIC_TEN - 1))))  # replacing with Geez 10s [፲,፳,፴, ...]
        else:
            if arones == '0':  # for 00 cases
                continue
        if arones != '0':
            amones = str(chr((int(arones) + (ETHIOPIC_ONE - 1))))  # replacing with Geez ones [፩,፪,፫, ...]
        if index > 0:
            if index % 2 != 0:  # odd index
                curr_geez = amtens + amones + str(chr(ETHIOPIC_HUNDRED))  # appending ፻
            else:  # even index
                curr_geez = amtens + amones + str(chr(ETHIOPIC_TEN_THOUSAND))  # appending ፼
        else:  # last bigram (rightmost part)
            curr_geez = amtens + amones
        geez.append(curr_geez)
    geez = ''.join(geez[::-1])
    if geez.startswith('፩፻') or geez.startswith('፩፼'):
        geez = geez[1:]
    if len(arabicNumber) >= 7:
        end_zeros = ''.join(re.findall('([0]+)$', arabicNumber)[0:])
        i = int(len(end_zeros) / 3)
        if len(end_zeros) >= (3 * i):
            if i >= 3:
                i -= 1
            for thousand in range(i - 1):
                print(thousand)
                geez += '፼'
    return geez
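# Added example (not from the original run): 12 should map to the Geez numeral ፲፪ (ten, two).
print(arabic2geez(12))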
def normalize_number(number):
    if '.' not in str(number):
        return arabic2geez(number)
    else:
        num, decimal = str(number).split('.')
        if decimal.startswith('0'):
            decimal = decimal[1:]
            dot = ' ነጥብ ዜሮ '
        else:
            dot = ' ነጥብ '
        return arabic2geez(num) + dot + arabic2geez(decimal)
def is_number(word):
    try:
        float(word)
        return True
    except ValueError:
        pass
    # fall back to Unicode numeric characters (e.g. Geez numerals)
    try:
        import unicodedata
        unicodedata.numeric(word)
        return True
    except (TypeError, ValueError):
        pass
    return False
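# Added example (not from the original run): is_number accepts anything float() or
# unicodedata.numeric() can parse.
print(is_number('3.5'), is_number('abc'))   # expected: True False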
def anynumber(string):
    return any(i.isdigit() for i in string)
def preprocess_sentence(w):
    w = re.sub(r"([።?.፣!,“”፤;:])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub('[\►\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}›\’\‘\"\'\ʹ\‹\/\\\\|\`\´\~\-\=\+\፦\፥\፧\፨\፠]', '', w)
    w = normalize_char_level_missmatch(w)
    w = w.rstrip().strip()
    # adding a start and an end token to the sentence
    # so that the model knows when to start and stop predicting
    w = '<start> ' + w + ' <end>'
    return w
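# Added example (not from the original run): punctuation is split off, characters are
# normalized, and the <start>/<end> markers are attached.
print(preprocess_sentence('ሰላም ነው።'))   # expected: <start> ሰላም ነው ። <end>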
def evaluate(sentence):
    attention_plot = np.zeros((en_length, am_length))
    sentence = preprocess_sentence(sentence)
    # note: a word that never appeared in the training vocabulary will raise a KeyError here
    inputs = [am_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=am_length,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.word_index['<start>']], 0)
    for t in range(en_length):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += en_tokenizer.index_word[predicted_id] + ' '
        if en_tokenizer.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot
# function for plotting the attention weights
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')
    fontdict = {'fontsize': 14}
    ax.set_xticklabels([''] + sentence, fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence, fontdict=fontdict)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    plt.show()
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
    attention_plot = attention_plot[:len(result.split(' ')), :len(sentence.split(' '))]
    plot_attention(attention_plot, sentence.split(' '), result.split(' '))
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fdddd039c50>
sen3="ይህ ሰው ግን ስለ ኃጢአት ለሁልጊዜ የሚሆን አንድ መሥዋዕት አቅርቦ በአምላክ ቀኝ ተቀምጧል"
translate(sen3)
Input: <start> ይህ ሰው ግን ስለ ሀጢአት ለሁልጊዜ የሚሆን አንድ መስዋእት አቅርቦ በአምላክ ቀኝ ተቀምጧል <end>
Predicted translation: does but if it was not oil in the lord , <end>
# The translation is poor. Further exploration could include:
- word embeddings
- contextual embeddings
- data coverage and preprocessing