#https://numpy.org/doc/stable/user/absolute_beginners.html
#https://docs.python.org/3/tutorial/introduction.html#lists
"""Train a decision tree to predict the next word of a verse and print a sample.

Pipeline: read a '|'-delimited CSV of verses, build a vocabulary of the
words it contains, encode the whole text as a stream of integer word ids
(with one extra id marking sentence punctuation), fit a
DecisionTreeClassifier on (word -> next word) pairs, then generate a short
word sequence starting from a random vocabulary word.
"""
import csv
import random

import numpy as np
from matplotlib import image
from matplotlib import pyplot
from numpy import asarray
from sklearn import preprocessing
from sklearn import tree
from sklearn import utils
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# PREPARING THE DATA FOR TRAINING AND TESTING
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


def input_filename_csv_return_dict(filename, encoding=None):
    """Read a '|'-delimited CSV file and return its rows as dictionaries.

    Args:
        filename: Path of the CSV file. Each row is expected to hold
            surah number, verse number and verse text, in that order.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        A list with one dict per usable row, keyed by 'surah_number',
        'verse_number' and 'verse_text'. Rows with fewer than three fields
        (e.g. blank lines) are skipped; fields beyond the third are ignored.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(filename, newline='', encoding=encoding) as csvfile:
        # QUOTE_NONE: '|' is only ever a delimiter. Using '|' as the quote
        # character as well makes an empty field start a "quoted" field that
        # swallows the following text, and identical delimiter/quotechar
        # values are rejected with ValueError by recent Python releases.
        reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
        return [
            {
                'surah_number': row[0],
                'verse_number': row[1],
                'verse_text': row[2],
            }
            for row in reader
            if len(row) >= 3
        ]


# Characters stripped from a word before it is looked up in the vocabulary.
# Other punctuation ( , ( ) : ; " ' - ` ) is deliberately left attached to
# the word; the original author's note records vocabulary sizes of
# 10374 vs 13705 vs 13427 for different stripping choices.
_SENTENCE_PUNCTUATION = str.maketrans('', '', '.!? ')


def _clean_word(word):
    """Return *word* without '.', '!', '?' and space characters."""
    return word.translate(_SENTENCE_PUNCTUATION)


def _build_vocabulary(rows):
    """Return the set of distinct, non-empty cleaned words in all verse texts."""
    vocabulary = set()
    for row in rows:
        for word in row['verse_text'].split():
            cleaned = _clean_word(word)
            if cleaned:
                vocabulary.add(cleaned)
    return vocabulary


def _encode_tokens(rows, word_to_id, separator):
    """Return every verse text, in order, as one flat list of integer ids.

    Each word contributes its vocabulary id; when cleaning removed any
    character from the word, *separator* is appended right after it. A token
    that consists only of stripped characters contributes just *separator*.
    """
    tokens = []
    for row in rows:
        for word in row['verse_text'].split():
            cleaned = _clean_word(word)
            if cleaned:
                tokens.append(word_to_id[cleaned])
            if cleaned != word:
                tokens.append(separator)
    return tokens


# +------------------------------------------+
# | surah_number | verse_number | verse_text |
# +------------------------------------------+

#open csv file
filename = 'English-Literal-125utf.csv'
csv_as_dict = input_filename_csv_return_dict(filename)

#predict next word
# go thru each verse text
# split each verse text into words
# pair of words, before and after
# predict next word

#python set of unique words
unique_words_set = _build_vocabulary(csv_as_dict)

#convert set to list; sorted so that word ids are the same on every run
#(plain set iteration order changes between runs under hash randomisation)
unique_words_list = sorted(unique_words_set)

#add integer for each word
unique_dict_words_lookup_i = dict(enumerate(unique_words_list))
unique_dict_words_lookup_text = {
    word: index for index, word in unique_dict_words_lookup_i.items()
}

#SAMPLE OUTPUT OF unique_dict_words_lookup_text (illustrative excerpt)
# { ..., 'thinking/assuming': 13412, 'buried': 13413, 'than)': 13414, '-': 13415, 'worship(ping),': 13416,
# 'completed/abundant': 13417, '(DISCREPANCY': 13418, 'leader/grantor"': 13419, '474': 13420, 'still': 13421,
# 'loose': 13422, 'justice': 13423, 'fallen/dropped': 13424, 'foolish': 13425, 'debauchers`': 13426, 'talked': 13427}

#one id past the last word id; marks stripped sentence punctuation
separator_integer = len(unique_words_list)

#go thru verse text and replace words with integers
all_tokens = _encode_tokens(
    csv_as_dict, unique_dict_words_lookup_text, separator_integer
)

#'|'-terminated text form of the token stream, kept for inspection
string_all_words = ''.join(str(token) + '|' for token in all_tokens)

# SAMPLE OUTPUT for string_all_words everything is an integer
# 1|13116|13313|5353|13428|12451|7314|9748|9613|5939|827|13428|2473|10699|9613|10719|674|155|
# 6919|3782|13428|5227|4037|3348|10604|1499|395|12171|9955|12171|7266|2861|6060|13428|2473|
# 5015|8871|8270|12219|10719|10917|6225|2001|13428|6797|10000|8246|854|1961|854|10000|10719|
print('done', separator_integer)

if not unique_words_list or len(all_tokens) < 2:
    raise SystemExit('Not enough words in ' + filename + ' to train a model')

#sliding window of 1 word: each token predicts the token that follows it
X_train = all_tokens[:-1]
Y_target = all_tokens[1:]

#convert X_train and Y_target to numpy arrays (one feature column)
X_train_np = np.array(X_train).reshape(-1, 1)
Y_target_np = np.array(Y_target)

#sliding window of 2 words: each pair of tokens predicts the following token
#(prepared here but not used by the model trained below)
X_train_2 = []
Y_target_2 = []
for first, second, following in zip(all_tokens, all_tokens[1:], all_tokens[2:]):
    X_train_2.append([first, second])
    Y_target_2.append(following)

#convert X_train_2 and Y_target_2 to numpy arrays (two feature columns)
X_train_np_2 = np.array(X_train_2).reshape(-1, 2)
Y_target_np_2 = np.array(Y_target_2)

#unrestricted depth; limits such as max_depth=128 / 512 and a fixed
#random_state were tried by the original author
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_np, Y_target_np)

#randrange excludes the upper bound, so the separator id (which has no
#entry in unique_dict_words_lookup_i) can never be picked as the first word
random_number = random.randrange(len(unique_words_list))
next_word_as_integer = random_number
print(unique_dict_words_lookup_i[random_number], end=' ')

for _ in range(1, 20):
    #predict returns a 1-element array; take the element before converting
    next_word_as_integer = int(clf.predict([[next_word_as_integer]])[0])
    if next_word_as_integer == separator_integer:
        print('.', end=' ')
    else:
        print(unique_dict_words_lookup_i[next_word_as_integer], end=' ')