#https://numpy.org/doc/stable/user/absolute_beginners.html
#https://docs.python.org/3/tutorial/introduction.html#lists
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn import tree
from sklearn import preprocessing
from sklearn import utils
import random
from matplotlib import image
from matplotlib import pyplot
from numpy import asarray
import numpy as np
import csv


# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#               PREPARING THE DATA FOR TRAINING AND TESTING
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#function for reading a pipe-delimited csv file and returning one column of every row joined into a single space-separated string
def input_filename_csv_return_string(filename,col):
    """Return one column of a pipe-delimited file as a single string.

    Every non-blank row contributes ``row[col]`` followed by one space, so
    the result ends with a trailing space whenever at least one row was read.

    Args:
        filename: Path of the ``|``-delimited text file to read.
        col: Zero-based index of the column to extract.

    Returns:
        The selected column of all rows, concatenated in file order, or
        ``''`` when the file contains no rows.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has no column ``col``.
    """
    with open(filename, newline='') as csvfile:
        # Quoting is switched off instead of reusing '|' as quotechar: with
        # quotechar equal to the delimiter an empty field ("a||b") opens a
        # quoted section and swallows the following text, and newer Python
        # releases reject such a dialect. Double quotes stay literal text.
        reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
        # Blank lines are returned as empty rows; skip them rather than
        # failing on row[col]. join() avoids repeated string concatenation.
        return ''.join(row[col] + ' ' for row in reader if row)

#  +------------------------------------------+
#  | verse_text + verse_text                  | 
#  +------------------------------------------+

#open csv file
# Read column 2 of the source file (presumably the verse text) as one string.
filename = 'English-Literal-125utf.csv'
file_text = input_filename_csv_return_string(filename, 2)

# Normalise the text: turn punctuation into blanks, squeeze runs of blanks
# (triples first, then doubles) and finally lower-case everything.
punctuation_to_space = str.maketrans(dict.fromkeys('.,!?/():;', ' '))
file_text = file_text.translate(punctuation_to_space)
file_text = file_text.replace('   ', ' ')
file_text = file_text.replace('  ', ' ')
file_text = file_text.lower()
#print(file_text)

#predict next word
# go thru each verse text
# split each verse text into words
# pair of words, before and after
# predict next word
#python set of unique words


# Vocabulary: every distinct whitespace-separated token of the cleaned text.
words = file_text.split()
unique_words_set = set(words)

# Freeze the vocabulary into a list; a word's position is its integer id.
# The ids follow the set's iteration order, which is arbitrary.
unique_words_list = list(unique_words_set)

# Two-way lookup tables: id -> word and word -> id.
unique_dict_words_lookup_i = dict(enumerate(unique_words_list))
unique_dict_words_lookup_text = {
    text: index for index, text in enumerate(unique_words_list)
}

print(len(unique_words_list))
#SAMPLE OUTPUT OF unique_dict_words_lookup_text (excerpt, word -> integer id)
# { 13411, 'thinking/assuming': 13412, 'buried': 13413, 'than)': 13414, '-': 13415, 'worship(ping),': 13416, 
# 'completed/abundant': 13417, '(DISCREPANCY': 13418, 'leader/grantor"': 13419, '474': 13420, 'still': 13421, 
# 'loose': 13422, 'justice': 13423, 'fallen/dropped': 13424, 'foolish': 13425, 'debauchers`': 13426, 'talked': 13427}
#separator_integer = len(unique_words_list)

#go thru verse text and replace each word with its integer id
#using unique_dict_words_lookup_text (word -> id)
# Encode the corpus: each word becomes its id followed by a '|' separator,
# so the resulting string ends with a trailing '|'.
encoded_tokens = []
for word in words:
    encoded_tokens.append(str(unique_dict_words_lookup_text[word]) + '|')
string_all_words = ''.join(encoded_tokens)


# SAMPLE OUTPUT for string_all_words everything is an integer
# 1|13116|13313|5353|13428|12451|7314|9748|9613|5939|827|13428|2473|10699|9613|10719|674|155|
# 6919|3782|13428|5227|4037|3348|10604|1499|395|12171|9955|12171|7266|2861|6060|13428|2473|
# 5015|8871|8270|12219|10719|10917|6225|2001|13428|6797|10000|8246|854|1961|854|10000|10719|


# Single-word data set: each id (feature) is paired with its successor (target).
# Splitting on '|' leaves one empty string after the final separator, which is
# why two entries are discounted when counting the pairs.
words = string_all_words.split('|')
pair_count = max(len(words) - 2, 0)
X_train = words[:pair_count]
Y_target = words[1:pair_count + 1]

# Array form: a single feature column and a flat target vector.
X_train_np = np.array(X_train).reshape(-1, 1)
Y_target_np = np.array(Y_target)


#train 2 sets of words and predict next word
# Two-word data set: ids (i, i+1) are the features, ids (i+2, i+3) the targets.
words = string_all_words.split('|')

# string_all_words ends with '|', so the split yields a trailing empty token.
# Drop empty tokens before windowing; otherwise the last window's target
# contains '', which is not a word id and cannot be converted back with int().
encoded_ids = [token for token in words if token]

# Slide a window of four consecutive ids over the corpus. zip() stops at the
# shortest slice, so fewer than four ids simply produce no samples.
X_train_2 = [
    [first, second]
    for first, second in zip(encoded_ids, encoded_ids[1:-2])
]
Y_target_2 = [
    [third, fourth]
    for third, fourth in zip(encoded_ids[2:], encoded_ids[3:])
]

# Array form; ids are kept as the digit strings produced by the split.
X_train_np_2 = np.array(X_train_2)
Y_target_np_2 = np.array(Y_target_2)

# Fit an unrestricted multi-output tree: the input is a pair of word ids, the
# output is the pair of ids that follows it. Depth-limited variants tried before:
#clf = tree.DecisionTreeClassifier(random_state=0, max_depth=128)
#clf = tree.DecisionTreeClassifier( max_depth=512)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_np_2, Y_target_np_2) #17gigabytes of ram memory required

# Seed the generator with two random word ids. randrange() excludes its upper
# bound; random.randint(0, n) is inclusive and could return n, which is not a
# key of unique_dict_words_lookup_i and raised KeyError on the print below.
random_number_1 = random.randrange(len(unique_words_list))
random_number_2 = random.randrange(len(unique_words_list))

next_word_as_integer = random_number_1
print(unique_dict_words_lookup_i[random_number_1], unique_dict_words_lookup_i[random_number_2])

# Generate 39 further word pairs, feeding every prediction back in as the
# next input pair.
for k in range(1, 40):
    # predict() gets one sample and returns one row holding the two next ids.
    next_words_as_integers = clf.predict([[int(random_number_1), int(random_number_2)]])
    next_word_list = next_words_as_integers.tolist()
    random_number_1 = next_word_list[0][0]
    random_number_2 = next_word_list[0][1]
    print(unique_dict_words_lookup_i[int(random_number_1)], unique_dict_words_lookup_i[int(random_number_2)], end=' ')