#https://numpy.org/doc/stable/user/absolute_beginners.html
#https://docs.python.org/3/tutorial/introduction.html#lists
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn import tree
from sklearn import preprocessing
from sklearn import utils
import random
from matplotlib import image
from matplotlib import pyplot
from numpy import asarray
import numpy as np
import csv


# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#               PREPARING THE DATA FOR TRAINING AND TESTING
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#function for reading csv file and returning a list of rows as dictionaries
def input_filename_csv_return_dict(filename, encoding='utf-8'):
    """Read a '|'-delimited file and return its rows as dictionaries.

    The first three columns of every row are stored under the keys
    'surah_number', 'verse_number' and 'verse_text'; all values are kept
    as the strings read from the file.

    Args:
        filename: path of the '|'-delimited text file to read.
        encoding: text encoding used to decode the file.  Given explicitly
            so the result does not depend on the platform's default
            encoding.

    Returns:
        A list with one dictionary per non-blank row, in file order.

    Raises:
        IndexError: if a non-blank row has fewer than three columns.
    """
    list_of_dicts = []
    with open(filename, newline='', encoding=encoding) as csvfile:
        # The file has no quoting convention, so quote processing is turned
        # off instead of declaring '|' as both delimiter and quote character:
        # newer Python versions reject a dialect that uses one character in
        # two roles, and older ones start a (possibly multi-line) quoted
        # field whenever a column is empty.
        reader = csv.reader(csvfile, delimiter='|', quoting=csv.QUOTE_NONE)
        for row in reader:
            # csv.reader yields an empty list for a blank line; there is
            # nothing to index in it, so skip it.
            if not row:
                continue
            list_of_dicts.append({
                'surah_number': row[0],
                'verse_number': row[1],
                'verse_text': row[2],
            })
    return list_of_dicts

#  +------------------------------------------+
#  | surah_number | verse_number | verse_text | 
#  +------------------------------------------+

#open csv file
filename = 'English-Literal-125utf.csv'
csv_as_dict = input_filename_csv_return_dict(filename)

# Plan for the rest of the script (next-word prediction):
#   1. go through each verse text and split it into words
#   2. collect the set of unique words (the vocabulary)
#   3. pair each word with the word that follows it and train on the pairs

# Build the vocabulary.
# Only the sentence-ending marks . ! ? are stripped from a word; other
# punctuation such as , ( ) : ; " ' - ` stays attached to it (the
# replacements for those were disabled in the original code).
# Author's note kept from the original: "10374 vs 13705 vs 13427" --
# presumably vocabulary sizes under different stripping rules; TODO confirm.
# No whitespace stripping is needed: str.split() without arguments never
# leaves whitespace inside a word.
# NOTE: a token made only of stripped marks (e.g. '...') becomes the empty
# string and is stored as a vocabulary entry of its own.
unique_words_set = set()
for row in csv_as_dict:
    for raw_word in row['verse_text'].split():
        cleaned_word = raw_word
        for mark in ('.', '!', '?'):
            cleaned_word = cleaned_word.replace(mark, '')
        unique_words_set.add(cleaned_word)

# Convert the set to a list.  Sorting (instead of a bare list(set)) gives
# every word the same position on every run; set iteration order for
# strings can change between interpreter runs, which made the word ids
# derived from this list non-reproducible.
unique_words_list = sorted(unique_words_set)
# Give every vocabulary word an integer id: its position in unique_words_list.
#   unique_dict_words_lookup_i    : id   -> word
#   unique_dict_words_lookup_text : word -> id
unique_dict_words_lookup_i = dict(enumerate(unique_words_list))
unique_dict_words_lookup_text = {
    text: index for index, text in unique_dict_words_lookup_i.items()
}

# Illustrative excerpt of unique_dict_words_lookup_text (the actual ids
# depend on the order of unique_words_list):
# { ..., 'thinking/assuming': 13412, 'buried': 13413, 'than)': 13414,
#   '-': 13415, 'worship(ping),': 13416, 'completed/abundant': 13417,
#   '(DISCREPANCY': 13418, 'leader/grantor"': 13419, '474': 13420,
#   'still': 13421, 'loose': 13422, 'justice': 13423,
#   'fallen/dropped': 13424, 'foolish': 13425, 'debauchers`': 13426,
#   'talked': 13427}

# The first id not used by any word; it stands for "sentence-ending
# punctuation" in the encoded text.
separator_integer = len(unique_words_list)

# Go through every verse and replace each word with its integer id from
# unique_dict_words_lookup_text.  The result, string_all_words, is one long
# string in which every id is followed by '|'.
# A word that carried one of the stripped marks (. ! ?) is followed by an
# extra token, separator_integer (== len(unique_words_list)), so the end of
# a sentence is visible in the id stream.
separator_token = str(separator_integer)
encoded_tokens = []
for row in csv_as_dict:
    for word in row['verse_text'].split():
        # Same clean-up as used when the vocabulary was built.
        lookup_word = word
        for mark in ('.', '!', '?'):
            lookup_word = lookup_word.replace(mark, '')

        word_id = unique_dict_words_lookup_text.get(lookup_word)
        if word_id is None:
            print('Could not find in unique_dict_words_lookup_text',word)
            continue

        encoded_tokens.append(str(word_id))
        # The word changed during clean-up, so it ended with punctuation.
        if word != lookup_word:
            encoded_tokens.append(separator_token)

# Tokens are collected in a list and joined once; growing a string with +=
# inside the loop re-copies the whole string for every word.
string_all_words = ''.join(token + '|' for token in encoded_tokens)

# Illustrative excerpt of string_all_words -- every token is an integer:
# 1|13116|13313|5353|13428|12451|7314|9748|9613|5939|827|13428|2473|10699|
# 9613|10719|674|155|6919|3782|13428|5227|4037|3348|10604|1499|395|12171|
# 9955|12171|7266|2861|6060|13428|2473|5015|8871|8270|12219|10719|10917|


print('done', separator_integer)

# Decode string_all_words back into a list of integer ids.
# The string ends with '|', so split('|') yields a trailing empty token;
# it is dropped here.  Previously that empty string ended up as the target
# of the last two-word sample.
# Ids are converted to int so the arrays below are numeric rather than
# arrays of digit strings.
token_ids = [int(token) for token in string_all_words.split('|') if token]

# --- one-word context: previous token -> next token ---
X_train = token_ids[:-1]
Y_target = token_ids[1:]

# scikit-learn expects a 2-D feature matrix: one row per sample, one column.
X_train_np = np.array(X_train).reshape(-1, 1)
Y_target_np = np.array(Y_target)

# --- two-word context: (token i, token i+1) -> token i+2 ---
X_train_2 = [
    [first, second]
    for first, second in zip(token_ids[:-2], token_ids[1:-1])
]
Y_target_2 = token_ids[2:]

# reshape keeps the matrix two columns wide even when there are no samples.
X_train_np_2 = np.array(X_train_2).reshape(-1, 2)
Y_target_np_2 = np.array(Y_target_2)

# Train a decision tree that maps a token id to the id of the token that
# follows it.  Alternatives tried by the original author:
#   tree.DecisionTreeClassifier(random_state=0, max_depth=128)
#   tree.DecisionTreeClassifier(max_depth=512)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_np, Y_target_np)

# Pick a random start word.  randrange excludes its upper bound, so the id
# is always a real word; random.randint(0, len(...)) is inclusive and could
# return separator_integer, which has no entry in unique_dict_words_lookup_i.
random_number = random.randrange(len(unique_words_list))
next_word_as_integer = random_number
print(unique_dict_words_lookup_i[random_number], end=' ')

# Generate 19 more tokens, feeding each prediction back in as the next input.
for _ in range(1, 20):
    # predict() returns a 1-element array; take the element before
    # converting, because calling int() on the array itself is deprecated
    # in recent NumPy versions.
    next_word_as_integer = int(clf.predict([[next_word_as_integer]])[0])
    if next_word_as_integer == separator_integer:
        print('.', end=' ')
    else:
        print(unique_dict_words_lookup_i[next_word_as_integer], end=' ')