#!/usr/bin/python3
"""
The point of this script is to build a vector space model for the
reuters21578 dataset using basic tf-idf weighting.

-------------------------------
The Reuters-21578, Distribution 1.0 test collection is available from
David D. Lewis' professional home page, currently:
    http://www.research.att.com/~lewis

Besides this README file, the collection consists of 22 data files, an
SGML DTD file describing the data file format, and six files describing
the categories used to index the data. (See Sections VI and VII for more
details.) Some additional files, which are not part of the collection
but have been contributed by other researchers as useful resources, are
also included. All files are available uncompressed, and in addition a
single gzipped Unix tar archive of the entire distribution is available
as reuters21578.tar.gz.

The text categorization mailing list, DDLBETA, is a good place to send
questions about this collection and other text categorization issues.
You may join the list by writing David Lewis at lewis@research.att.com.
-------------------------------

reuters21578 Dataset categories:

                    Number of     Number of Categories    Number of Categories
    Category Set    Categories    w/ 1+ Occurrences       w/ 20+ Occurrences
    ************    **********    ********************    ********************
    EXCHANGES           39               32                        7
    ORGS                56               32                        9
    PEOPLE             267              114                       15
    PLACES             175              147                       60
    TOPICS             135              120                       57
"""

"""
TODOs:
    - Do feature selection on words of dictionary
"""

from bs4 import BeautifulSoup
import traceback
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
# -- mine
import myutil

# ------------------------------
# Load dataset in python objects
# -----
DATASETS_PATH = "../../datasets/"
ORIG_REUTERS_DATASET_PATH = DATASETS_PATH + "/reuters21578_orig/"
REUTERS_DATASET_PATH = DATASETS_PATH + "/reuters21578/reuters21578_dataset.pickle"
REUTERS_DICTIONARY_PATH = DATASETS_PATH + "/reuters21578/reuters21578_dictionary.pickle"


class Document(object):
    def __init__(self, topics, title, date, body):
        self.topics = topics
        self.title = title
        self.date = date
        self.body = body
        self.vsm_tuple = []
        self.freqs = {}

    def get_tokens(self):
        return word_tokenize(str(pre_process(self.body)))


# ----
def dump_dataset():
    """Store the original Reuters-21578 dataset as a list of python
    Document() objects.
    """
    tot_docs = []
    for i in range(0, 22):
        print(f"Processing file: {i}")
        filename = f"reut2-00{i}.sgm" if i < 10 else f"reut2-0{i}.sgm"
        tot_docs += read_docs(ORIG_REUTERS_DATASET_PATH + filename)
    create_dictionary(tot_docs)


def load_dataset():
    return myutil.load_pickle(REUTERS_DATASET_PATH)


# ----
def read_docs(file_path):
    final_docs = []
    with open(file_path, "r") as infile:
        text = infile.read()
        soup = BeautifulSoup(text, 'lxml')
        docs = soup.find_all("reuters")
        for doc in docs:
            # -- get all topics
            topic_sets = ["topics", "places", "people", "orgs",
                          "exchanges", "companies"]
            doc_topics = []
            for topic_set in topic_sets:
                doc_topics += [topic.decode_contents().strip()
                               for topic in doc.find(topic_set).find_all("d")]
            if doc_topics == []:
                # -- skip docs with no topics
                continue
            try:
                doc_body = ""
                doc_date = ""
                doc_title = ""
                if doc.find("dateline"):
                    # -- with lxml cannot find body tag directly (?)
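                    # -- (presumably the lxml HTML parser treats <BODY> as the
                    # -- HTML body element and rewrites it away, so the article
                    # -- text is reached by stepping past <DATELINE> instead)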
                    doc_body = str(doc.find("dateline").next.next)
                    doc_date = doc.find("dateline").decode_contents().strip()
                    if doc.find("title"):
                        doc_title = doc.find("title").decode_contents().strip()
                elif doc.find("text"):
                    doc_body = doc.find("text").decode_contents().strip()
            except Exception as e:
                print(doc)
                traceback.print_exc()
                print(e)
                # -- skip documents that fail to parse
                continue
            final_docs.append(Document(doc_topics, doc_title, doc_date, doc_body))
    print(f"Total n. of docs: {len(final_docs)}")
    return final_docs


# -------------------
# Dictionary stuff
# -----
def create_dictionary(docs):
    """
    Creates a dictionary out of a bunch of documents. For each word the
    dictionary maintains the number of distinct docs in which that word
    appears.
    """
    dictionary = {}
    for i, doc in enumerate(docs):
        print(f"Processing doc: {i}")
        seen_in_doc = {}
        pre_processed_text = pre_process(doc.body)
        freqs = {}
        for token in word_tokenize(str(pre_processed_text)):
            # -- update freqs for current doc
            freqs[token] = (freqs[token] + 1) if token in freqs else 1
            # -- update dictionary
            if token not in dictionary:
                dictionary[token] = 1
            elif token not in seen_in_doc:
                # -- increment count only for first occurrence in
                # -- current doc
                dictionary[token] += 1
            seen_in_doc[token] = True
        # -- save freqs for vsm creation
        doc.freqs = freqs
    # -- write to disk
    myutil.store_pickle(dictionary, REUTERS_DICTIONARY_PATH)
    myutil.store_pickle(docs, REUTERS_DATASET_PATH)


def load_dictionary():
    return myutil.load_pickle(REUTERS_DICTIONARY_PATH)


def feature_selection(dictionary):
    # TODO: actually implement feature selection instead of this
    # placeholder that just keeps the first N words
    N = 100
    small_dict = {}
    for i, word in enumerate(dictionary):
        if i >= N:
            break
        small_dict[word] = dictionary[word]
    return small_dict


# -------------------
# Preprocessing chain
# -----
def convert_lower_case(data):
    return np.char.lower(data)


def convert_numbers(data):
    num2word = {
        "0": " zero ",
        "1": " one ",
        "2": " two ",
        "3": " three ",
        "4": " four ",
        "5": " five ",
        "6": " six ",
        "7": " seven ",
        "8": " eight ",
        "9": " nine ",
    }
    for i in range(0, 10):
        data = np.char.replace(data, str(i), num2word[str(i)])
    return data


def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def stemming(data):
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)


def pre_process(data):
    data = convert_lower_case(data)
    data = convert_numbers(data)
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data


# -------------------
# Creation of VSM
# -----
def idf(word, tot_docs, dictionary):
    """
    Computes inverse-document-frequency (idf).
    """
    return math.log(tot_docs / dictionary[word])


def tf(word, doc):
    """
    Computes term-frequency (tf).
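
    Uses the log-scaled variant: tf = 1 + ln(count) when the word occurs
    in the doc, and 0 otherwise (math.log is the natural log). E.g. a raw
    count of 1 gives tf = 1.0 and a count of 7 gives tf ~= 2.95.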
""" return 1 + math.log(doc.freqs[word]) if word in doc.freqs else 0 def create_vsm(docs, dictionary): """ This function updates the contents of each docs to contain a tuple which is the representation of the doc in the vector space model we're building. """ words = dictionary.keys() tot_docs = len(docs) for i, doc in enumerate(docs): print(f"VSM: Processing doc: {i}") doc_t = [] for word in words: score = tf(word, doc) * idf(word, tot_docs, dictionary) doc_t.append(score) doc.vsm_tuple = doc_t # -- write to disk myutil.store_pickle(docs, REUTERS_DATASET_PATH) # ------------------- if __name__ == "__main__": dump_dataset() # print("hello") # docs = load_dataset() # basic_dic = load_dictionary() # cool_dic = feature_selection(basic_dic) # create_vsm(docs, cool_dic)