# This script recodes, in a text corpus, the expressions listed in an Excel file
# @author <a href="mailto:wendeline.swart@irit.fr">Wendeline Swart</a>
# @author <a href="mailto:guillaume.cabanac@irit.fr">Guillaume Cabanac</a>
# @author <a href="mailto:Cecile.Crespy@ut-capitole.fr">Cécile Crespy</a>
# @since 15-NOV-2023
# @version 15-NOV-2023 Code structure


# Import all the libraries we need
import pandas as pd
import re
import io

# Path of the Excel file listing the expressions to recode
excelFile = r"C:\Users\underlined_words.xlsx"

# Load the text containing all the reports as a single UTF-8 string.
# The context manager guarantees the file handle is closed, even if the
# read fails part-way through.
with io.open('corpus.txt', 'r', encoding='utf-8') as corpusFile:
    text = corpusFile.read()

# Load the recoding table and order its rows from the longest "word" to the
# shortest (presumably so that longer expressions are handled before shorter
# ones they may contain -- confirm with the authors).
data = pd.read_excel(excelFile).loc[:, ["word", "à remplacer par (valeurs)"]]

# Row labels ordered by ascending character length of the "word" column
wordLengths = data["word"].str.len()
sortData = wordLengths.sort_values(ascending=True).index

# Apply that ordering, then flip it to obtain descending length
data = data.reindex(index=sortData).iloc[::-1]

# Replace each expression of the first column with the corresponding value of
# the second column, case-insensitively and only on whole-expression matches.
# Rows are processed in the order in which they appear in `data`.
for row in data.values:
    rawWord, rawReplacement = row[0], row[1]

    # pandas represents empty cells as NaN; str(NaN) is the literal "nan",
    # which must neither be searched for nor written into the corpus.
    if pd.isna(rawWord) or pd.isna(rawReplacement):
        continue

    word = str(rawWord).strip()
    replacement = str(rawReplacement).strip()

    # A blank expression would produce a pattern matching at every word
    # boundary and insert the replacement all over the text.
    if not word:
        continue

    # re.escape neutralises every regex metacharacter (parentheses, dots,
    # brackets, "+", "?", ...), not only parentheses.
    # Lookarounds are used instead of "\b": "\b" next to a non-word character
    # such as ")" requires an adjacent word character, so an expression that
    # starts or ends with a parenthesis would not match as a standalone term.
    # For expressions delimited by word characters both forms are equivalent.
    pattern = r"(?<!\w)" + re.escape(word) + r"(?!\w)"
    print(pattern + " -> " + replacement)

    # A function replacement inserts the value verbatim; a plain string would
    # have its backslashes and group references ("\1", "\g<name>") interpreted.
    text = re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)
print(text)

# Save the modified text to the output file.
# The context manager closes (and flushes) the file even if the write fails.
# NOTE(review): mode "a" appends, so running the script again adds a second
# copy of the text to an existing fichierIramuteq.txt -- confirm whether
# overwriting (mode "w") is what is actually wanted.
with open("fichierIramuteq.txt", mode="a", encoding="UTF-8") as fichierIramuteq:
    fichierIramuteq.write(text)