# This script extracts underlined expressions from Word documents (the .docm files in src/) and writes them to an Excel file.
# @author <a href="mailto:wendeline.swart@irit.fr">Wendeline Swart</a>
# @author <a href="mailto:guillaume.cabanac@irit.fr">Guillaume Cabanac</a>
# @author <a href="mailto:Cecile.Crespy@ut-capitole.fr">Cécile Crespy</a>
# @since 15-NOV-2023
# @version 15-NOV-2023 Code structure


# Imports
import re
from glob import glob
from html.parser import HTMLParser
from pathlib import Path

import xlsxwriter
from docx2python import docx2python
from IPython.display import clear_output


# Class for parsing data in HTML format
class MyHTMLParser(HTMLParser):
    """Collect the text of every underlined (``<u>``) span fed to the parser.

    All the text found between an opening ``<u>`` and its matching ``</u>``
    is gathered into a single entry, even when other inline tags (for
    example ``<b>`` or ``<i>``) cut that text into several data chunks.
    A ``<br>`` met inside an underlined span starts a new entry, so an
    entry never runs across a line break. Nested ``<u>`` tags are tracked
    with a depth counter, so the text following an inner ``</u>`` is still
    treated as underlined until the outermost ``<u>`` is closed.
    """

    def __init__(self) -> None:
        """Create a parser with no open ``<u>`` tag and no collected text."""
        super().__init__()
        # Number of <u> tags currently open (0 means "not underlined").
        self.__u_depth: int = 0
        # True when the next underlined chunk must start a new entry
        # instead of being appended to the last one.
        self.__start_new: bool = True
        self.__words: list = []

    def handle_starttag(self, tag, attrs) -> None:
        """Track ``<u>`` openings and ``<br>`` separators.

        HTMLParser passes tag names in lower case, so ``<U>`` is handled too.
        """
        if tag == "u":
            if self.__u_depth == 0:
                # Outermost <u>: whatever follows is a new expression.
                self.__start_new = True
            self.__u_depth += 1
        elif tag == "br":
            # A line break ends the current expression.
            self.__start_new = True

    def handle_endtag(self, tag) -> None:
        """Track ``</u>`` closings; a stray ``</u>`` is ignored."""
        if tag == "u" and self.__u_depth > 0:
            self.__u_depth -= 1

    def handle_data(self, data) -> None:
        """Record ``data`` when it is located inside an underlined span."""
        if self.__u_depth == 0:
            return
        if self.__start_new:
            self.__words.append(data)
            self.__start_new = False
        else:
            # Same underlined span, merely interrupted by another inline
            # tag: extend the current entry rather than creating a fragment.
            self.__words[-1] += data

    def get_words(self) -> list:
        """Return the list of underlined texts collected so far, in order."""
        return self.__words


# Extraction of underlined words (from all documents)
# `words` maps each upper-cased underlined expression to the list of document
# names (file name without directory and extension) in which it was found.
words = {}  # {word: str -> documents: list of str}
# glob() returns paths in an unspecified order; sorting makes the processing
# order, and therefore the content of the output file, reproducible.
data_path = sorted(glob("src/*.docm"))
data_len = len(data_path)
# Underlined runs separated only by whitespace belong to the same expression.
# The separating whitespace is captured so that it can be kept when merging:
# dropping it would glue the neighbouring words together.
adjacent_runs = re.compile(r"</u>(\s*)<u>")
for i, doc_path in enumerate(data_path):
    print("Extracting content from file " + str(i + 1) + " on " + str(data_len))
    # The parsing below relies on the text carrying <u>...</u> tags around
    # underlined runs (requested with html=True); line breaks are turned
    # into <br> tags before the text is fed to the HTML parser.
    content = docx2python(doc_path, html=True).text.replace("\n", "<br>")
    content = adjacent_runs.sub(r"\1", content)
    parser = MyHTMLParser()
    parser.feed(content)
    # Path.stem copes with the path separator of the current platform and
    # only strips the final extension ("a.b.docm" -> "a.b").
    doc_name = Path(doc_path).stem
    for e in parser.get_words():
        temp_e = str(e).upper()
        documents = words.setdefault(temp_e, [])
        # List each document only once per expression
        if doc_name not in documents:
            documents.append(doc_name)
    clear_output(wait=True)


# Storing results in an Excel file
# One header row, then one row per expression: the expression in column A and
# the comma-separated names of the documents containing it in column B.
# The `with` block closes (and thereby saves) the workbook even if a write
# fails part-way through.
with xlsxwriter.Workbook("underlined_words.xlsx") as workbook:
    worksheet = workbook.add_worksheet()

    bold = workbook.add_format({'bold': 1})
    worksheet.write("A1", "word", bold)
    worksheet.write("B1", "document(s)", bold)

    # Row 0 holds the header, so data rows start at 1.
    for row, (item, documents) in enumerate(words.items(), start=1):
        # write_string() stores the text verbatim. The generic write()
        # dispatches on the content and would, for instance, turn an
        # expression starting with "=" into a formula.
        worksheet.write_string(row, 0, item)
        worksheet.write_string(row, 1, ", ".join(documents))