#!/usr/bin/env python3
# Code taken from lecture 12 of IR course done on 06/12/19

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import os
import numpy as np
import pandas as pd
# import pickle


# If you don't want to index the entire dataset, you can add only a
# specific subfolder using some of the following values: alt.atheism,
# comp.graphics, comp.windows.x, misc.forsale.
# DATASET = "20_newsgroups/comp.graphics"
DATASET = "20_newsgroups"

paths = []
for (dirpath, dirnames, filenames) in os.walk(str(os.getcwd())+'/'+DATASET+'/'):
    for i in filenames:
        paths.append(str(dirpath)+str("/")+i)

# -----------------------------------
# Pre-processing chain
# -----------------------------------


def remove_header(data):
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except Exception:
        print("No Header")
    return data


def convert_lower_case(data):
    return np.char.lower(data)


def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data


def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)


def convert_numbers(data):
    data = np.char.replace(data, "0", " zero ")
    data = np.char.replace(data, "1", " one ")
    data = np.char.replace(data, "2", " two ")
    data = np.char.replace(data, "3", " three ")
    data = np.char.replace(data, "4", " four ")
    data = np.char.replace(data, "5", " five ")
    data = np.char.replace(data, "6", " six ")
    data = np.char.replace(data, "7", " seven ")
    data = np.char.replace(data, "8", " eight ")
    data = np.char.replace(data, "9", " nine ")
    return data


def stemming(data):
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)


def preprocess(data, query):
    if not query:
        data = remove_header(data)
    data = convert_lower_case(data)
    data = convert_numbers(data)
    data = remove_punctuation(data)  # remove comma seperately
    data = remove_stop_words(data)
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data

# -----------------------------------
# Inverted Index Construction
# -----------------------------------


doc = 0
postings = pd.DataFrame()

for path in paths[0:50]:
    file = open(path, 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    preprocessed_text = preprocess(text, False)
    tokens = word_tokenize(str(preprocessed_text))
    for token in tokens:
        if token in postings:
            p = postings[token][0]
            p.add(doc)
            postings[token][0] = p
        else:
            postings.insert(value=[{doc}], loc=0, column=token)
    doc += 1

# NOTE: use picke to save it on local disk
#
# to write: postings.to_pickle(DATASET + "_unigram_postings")
# to read: postings = pd.read_pickle(title + "_unigram_postings")


def get_posting(word):
    if word in postings:
        return postings[word][0]
    else:
        return []


def print_word_postings(word):
    preprocessed_word = str(preprocess(word, True))
    print(preprocessed_word)
    postings = get_posting(preprocessed_word)
    print("Document Frequency:", len(postings))
    print("Postings List:", postings)


# -----------------------------------
# Query language implementation
# -----------------------------------


def get_not(word):
    a = get_posting(word)
    b = set(range(len(paths)))
    return b.difference(a)


def generate_command_tokens(query):
    query = query.lower()
    tokens = word_tokenize(query)

    command_tokens = []
    query_tokens = []

    for t in tokens:
        if t not in ['and', 'or', 'not']:
            processed_token = preprocess([t], True)
            query_tokens.append(str(processed_token))
        else:
            command_tokens.append(t)

    return command_tokens, query_tokens


def generate_not_tuple(query_tokens, command_tokens):
    tup = set()
    # as long as there are NOT command to execute
    while 'not' in command_tokens:
        i = command_tokens.index('not')
        word = query_tokens[i]
        word_postings = get_not(word)  # get result of NOT <word>
        tup.update(word_postings)
        command_tokens.pop(i)  # remove command
        # Replace the word with an ID to signal that it has been
        # processed
        #
        # NOTE: numbers have been already removed in the preprocessing
        # stage.
        query_tokens[i] = i
        print("\nAfter Not Processing:", command_tokens, query_tokens)
    return tup


def binary_operations(query_tokens, command_tokens, tup):
    if not query_tokens[0]:
        return tup

    a = get_posting(query_tokens[0])
    query_tokens.pop(0)

    for i in range(len(command_tokens)):
        if type(query_tokens[i]) == int:
            b = tup
        else:
            b = get_posting(query_tokens[i])

        if command_tokens[i] == 'and':
            a = a.intersection(b)
        elif command_tokens[i] == 'or':
            a = a.union(b)
        else:
            print("Invalid Command")

    return a


def execute_query(query):

    command_tokens, query_tokens = generate_command_tokens(query)
    tup = generate_not_tuple(query_tokens, command_tokens)

    print("-----------------------------------------")
    print("\nCommand Tokens:", command_tokens)
    print("\nQuery Tokens:", query_tokens)
    print("\nNot Tup:", len(tup))

    final_set = binary_operations(query_tokens, command_tokens, tup)
    final_set = sorted(final_set)

    print("\nFinal Set:", final_set)
    print("-----------------------------------------")
    return final_set


execute_query("place")
execute_query("welcome")
execute_query("authority")
execute_query("place and not authority")
execute_query("place or authority")
execute_query("place or authority and not welcome")