from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle

# Collect the paths of every document in the comp.graphics newsgroup.
title = "20_newsgroups"
paths = []
for (dirpath, dirnames, filenames) in os.walk(os.path.join(os.getcwd(), title, 'comp.graphics')):
    for i in filenames:
        paths.append(os.path.join(dirpath, i))

# -----------------------------------
# Pre-processing chain
# -----------------------------------
def remove_header(data):
    # Newsgroup headers end at the first blank line.
    try:
        ind = data.index('\n\n')
        data = data[ind:]
    except Exception:
        print("No Header")
    return data

def convert_lower_case(data):
    return np.char.lower(data)

def remove_stop_words(data):
    # Defined for completeness; not applied in preprocess() below, since
    # dropping stop words would shift token positions in the positional index.
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")  # collapse double spaces left behind
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

def remove_single_characters(data):
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if len(w) > 1:
            new_text = new_text + " " + w
    return np.char.strip(new_text)

def convert_numbers(data):
    # Spell out digits so numbers survive tokenization as words.
    data = np.char.replace(data, "0", " zero ")
    data = np.char.replace(data, "1", " one ")
    data = np.char.replace(data, "2", " two ")
    data = np.char.replace(data, "3", " three ")
    data = np.char.replace(data, "4", " four ")
    data = np.char.replace(data, "5", " five ")
    data = np.char.replace(data, "6", " six ")
    data = np.char.replace(data, "7", " seven ")
    data = np.char.replace(data, "8", " eight ")
    data = np.char.replace(data, "9", " nine ")
    return data

def stemming(data):
    stemmer = PorterStemmer()
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return np.char.strip(new_text)

def preprocess(data, query):
    # Queries carry no newsgroup header, so skip header removal for them.
    if not query:
        data = remove_header(data)
    data = convert_lower_case(data)
    data = convert_numbers(data)
    data = remove_punctuation(data)  # remove comma separately
    data = remove_apostrophe(data)
    data = remove_single_characters(data)
    data = stemming(data)
    return data

# -----------------------------------
# Inverted Index Construction
# -----------------------------------
postings = pd.DataFrame()
frequency = pd.DataFrame()

doc = 0
for path in paths:
    file = open(path, 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()

    preprocessed_text = preprocess(text, False)
    tokens = word_tokenize(str(preprocessed_text))

    pos = 0
    for token in tokens:
        if token in postings:
            p = postings[token][0]
            k = [a[0] for a in p]
            if doc in k:
                # Term already seen in this document: record the new position.
                for a in p:
                    if a[0] == doc:
                        a[1].add(pos)
            else:
                # First occurrence in this document: extend the postings list.
                p.append([doc, {pos}])
                frequency[token][0] += 1
        else:
            # First occurrence in the whole collection: open a new column.
            postings.insert(value=[[[doc, {pos}]]], loc=0, column=token)
            frequency.insert(value=[1], loc=0, column=token)
        pos += 1
    doc += 1
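# Shape of the index (the values below are illustrative, not taken from the
# corpus): each column of `postings` holds one positional postings list, e.g.
#   postings["graphic"][0] -> [[3, {17}], [0, {5, 42}]]
# i.e. [doc_id, {token positions}] pairs, while frequency["graphic"][0] holds
# the term's document frequency (here 2).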
# -----------------------------------
# Query language implementation
# -----------------------------------
def get_word_postings(word):
    preprocessed_word = str(preprocess(word, True))
    print(preprocessed_word)
    print("Document Frequency:", frequency[preprocessed_word][0])
    print("Postings List:", postings[preprocessed_word][0])

def get_positions(posting_values, doc):
    # Return the set of positions at which the term occurs in `doc`.
    for posting_value in posting_values:
        if posting_value[0] == doc:
            return posting_value[1]
    return set()

def get_posting(word):
    if word in postings:
        return postings[word][0]
    else:
        return []

def gen_init_set_matchings(word):
    # Expand the first query term into (doc, position) candidate pairs.
    init = []
    word_postings = get_posting(word)
    for word_posting in word_postings:
        for position in word_posting[1]:
            init.append((word_posting[0], position))
    return init

def match_positional_index(init, query_tokens):
    # A candidate survives only if every remaining query token occurs in the
    # same document at the next consecutive position.
    matched_docs = []
    for p in init:
        doc = p[0]
        pos = p[1]
        count = 0
        for query_token in query_tokens:
            pos = pos + 1
            query_token_pos = get_posting(query_token)
            docs_list = [z[0] for z in query_token_pos]
            if doc in docs_list:
                doc_positions = get_positions(query_token_pos, doc)
                print("CANDIDATE DOC\t", doc, "\t", doc_positions)
                if pos in doc_positions:
                    count += 1
                else:
                    break
            else:
                break
        if count == len(query_tokens):
            matched_docs.append(p[0])
    return set(matched_docs)

def run_query(query):
    processed_query = preprocess(query, True)
    print(processed_query)
    query_tokens = word_tokenize(str(processed_query))
    print("Query tokens:\t", query_tokens)

    # Single-term queries need no positional matching.
    if len(query_tokens) == 1:
        query_postings = get_posting(query_tokens[0])
        result = [a[0] for a in query_postings]
        print("Total Document Matches:\t", result)
        return result

    init_word = query_tokens[0]
    init_matches = gen_init_set_matchings(init_word)
    print("Initial Matches:\t", init_matches)

    query_tokens.pop(0)
    total_matched_docs = match_positional_index(init_matches, query_tokens)  # not very efficient
    total_matched_docs = sorted(total_matched_docs)
    print("Total Document Matches:", total_matched_docs)
    return total_matched_docs
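# -----------------------------------
# Example usage (illustrative sketch)
# -----------------------------------
# The query strings below are hypothetical; any term or phrase that occurs in
# the comp.graphics corpus works the same way.
get_word_postings("graphics")    # single-term lookup: document frequency + postings
run_query("computer graphics")   # phrase query via consecutive-position matching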