# https://numpy.org/doc/stable/user/absolute_beginners.html
# https://docs.python.org/3/tutorial/introduction.html#lists
"""Train a decision tree on consecutive word pairs and print generated words.

The script reads one column of a '|'-delimited file, maps every distinct word
to an integer id, fits a multi-output DecisionTreeClassifier that maps a pair
of word ids to the pair that follows it, and then prints words predicted from
a random starting pair.
"""
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn import tree
from sklearn import preprocessing
from sklearn import utils
import random
from matplotlib import image
from matplotlib import pyplot
from numpy import asarray
import numpy as np
import csv

# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# PREPARING THE DATA FOR TRAINING AND TESTING
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


def input_filename_csv_return_string(filename, col):
    """Return column ``col`` of every row in ``filename`` as one string.

    The file is parsed by ``csv.reader`` with '|' as both delimiter and
    quote character. Each selected cell is followed by a single space, so a
    non-empty result always ends with a space.

    Args:
        filename: Path of the file to read.
        col: Zero-based index of the column to extract from each row.

    Returns:
        The selected cells concatenated in file order, each followed by ' '.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a row has no column ``col``.
    """
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='|', quotechar='|')
        # join() instead of repeated "+=" keeps this linear in the file size.
        return ''.join(row[col] + ' ' for row in reader)


# +------------------------------------------+
# | verse_text + verse_text                  |
# +------------------------------------------+

# open csv file
filename = 'English-Literal-125utf.csv'
file_text = input_filename_csv_return_string(filename, 2)

# Replace punctuation with spaces in a single pass and lower-case the text.
# Runs of spaces need no extra collapsing: str.split() below treats any run
# of whitespace as one separator.
punctuation_to_space = str.maketrans({ch: ' ' for ch in '.,!?/():;'})
file_text = file_text.translate(punctuation_to_space).lower()
# print(file_text)

# predict next word
#   go thru each verse text
#   split each verse text into words
#   pair of words, before and after
#   predict next word

# python set of unique words
words = file_text.split()
unique_words_set = set(words)

# convert set to list (the order, and therefore the ids, can differ per run)
unique_words_list = list(unique_words_set)

# two-way lookup between a word and its integer id
unique_dict_words_lookup_i = dict(enumerate(unique_words_list))
unique_dict_words_lookup_text = {
    word: i for i, word in enumerate(unique_words_list)
}

print(len(unique_words_list))

# SAMPLE OUTPUT OF unique_dict_words_lookup_text
# { ..., 'buried': 13413, 'still': 13421, 'loose': 13422, 'justice': 13423,
#   'foolish': 13425, 'talked': 13427 }

# Go thru the text and replace every word with its integer id.
# The ids stay integers: serialising them to an "id|id|...|" string and
# splitting it again leaves a trailing empty token, which would end up in the
# last training target and cannot be converted back to a word.
# SAMPLE (ids from one run): 1, 13116, 13313, 5353, 13428, 12451, ...
word_ids = [unique_dict_words_lookup_text[word] for word in words]

# One-word window: each word id predicts the id that follows it.
# NOTE: these arrays are built but not used by the model trained below.
X_train = word_ids[:-1]
Y_target = word_ids[1:]

# convert X_train and Y_target to numpy arrays
X_train_np = np.array(X_train).reshape(-1, 1)
Y_target_np = np.array(Y_target)

# Two-word window: the pair (i, i+1) predicts the pair (i+2, i+3).
# The last valid start index is len(word_ids) - 4, so every target is a
# real word id.
pair_window_count = len(word_ids) - 3
X_train_2 = [
    [word_ids[i], word_ids[i + 1]] for i in range(pair_window_count)
]
Y_target_2 = [
    [word_ids[i + 2], word_ids[i + 3]] for i in range(pair_window_count)
]

# convert X_train_2 and Y_target_2 to numpy arrays
X_train_np_2 = np.array(X_train_2)
Y_target_np_2 = np.array(Y_target_2)

# clf = tree.DecisionTreeClassifier(random_state=0, max_depth=128)
# clf = tree.DecisionTreeClassifier( max_depth=512)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_np_2, Y_target_np_2)  # 17gigabytes of ram memory required

# Pick a random starting pair. randrange() excludes its upper bound, whereas
# randint(0, len(...)) could return len(...), which is not a valid word id.
random_number_1 = random.randrange(len(unique_words_list))
random_number_2 = random.randrange(len(unique_words_list))

print(unique_dict_words_lookup_i[random_number_1],
      unique_dict_words_lookup_i[random_number_2])

# Feed each predicted pair back in as the next input (39 predictions).
for k in range(1, 40):
    # predict() returns a numpy array with one row holding the two ids
    next_words_as_integers = clf.predict(
        [[int(random_number_1), int(random_number_2)]]
    )
    random_number_1, random_number_2 = next_words_as_integers.tolist()[0]
    print(unique_dict_words_lookup_i[int(random_number_1)],
          unique_dict_words_lookup_i[int(random_number_2)], end=' ')