#!/usr/bin/python

import numpy as np
from node import Node
from node import ParseTree

'''

TODOs:

   - fix comment explaining CYK algorithm
   - post on github
   - add probabilistic support (?)

Dependencies:
   
   - python2.7 -m pip install numpy
   - sudo apt-get install graphviz

'''

'''
Grammar representation notes

We will represent a grammar through a dictionary. In particular, given
a non terminal symbol of the grammar NT, g[NT] contains a list of
elements. The elements can either be list themselves, representing
concatenations of non-terminal symbols, or simple strings,
representing terminal symbols.

grammar = {}
grammar["S"] = [["A", "B"], ["B", "C"]]
grammar["A"] = [["B", "C"], "a"]
grammar["B"] = [["B", "B"], ["C", "C"], "b"]
grammar["C"] = ["a"]

'''


"""
CYK algorithm implementation notes

Assume that the grammar is in CNF. This allows us to make the
following assumptions:

1) We don't need to check if we can match an entire substring with a
   single non-terminal because the grammar is in CNF.

Returns an upper diagonal matrix that contains information about how a
particular substring of the input string is parsed. In particular, the
matrix can

   cyk_matrix[i][j] := {(NT, (i_1, j_1, NT1), (i_2, j_2, NT2)) |
                           NT1 is in cyk_matrix[i_1][j_1], NT2 is
                           in cyk_matrix[i_2][j_2], in the input
                           grammar there is the rule NT -> NT1
                           NT2}, for i < j

   cyk_matrix[i][i] := {(NT)}

For implementation purpose the matrix will contain dictionaries. In
particular, for each cell of the matrix there will be a corresponding
dictionary which will contain, as keys, all non-terminals that can be
written in that cell, as well as a "pos" value. The "pos" key will
have a value (i, j) indicating the position of the dictionary inside
the matrix. The values associated with each non terminal symbol
instead will be a list of tuples, where each tuple is defined as
mentioned above.

"""

# -------------------------
        
class CYKParser:
    
    def __init__(self, grammar=None, inputString=None):
        # the chart used for parsing
        self.parse_table = None 
        self.grammar = grammar
        self.inputString = inputString

        """'

        This function is used to fill a new cell of the parse table by
        using two previously filled cells. The cells are represented
        as dictionaries. 
        
        :param left_cell: dictionary representing the left_cell.
        :param right_cell: dictionary representing the right_cell.
        :param new_cell: dictionary representing the new cell to be filled.

        """
    def __fill_parse_table_cell(self, new_cell, left_cell, right_cell):
        i, j = new_cell["pos"]
        
        if i == j:
            # simple case: the new cell is in the main diagonal.
            # 
            # in this case we only have to check if the are
            # non-terminals that match directly to our terminal of
            # interest, which is the i-th (or j-th) character of the
            # input string.
            for key in left_cell:
                if key != "pos":
                    rhss_to_match = key
        else:

            left_nts = left_cell.keys()[:]
            right_nts = right_cell.keys()[:]
            left_nts.remove("pos"), right_nts.remove("pos")
            
            # compute the cartesian product between the non-terminals
            # of left_cell and the non-terminals of right_cell. 
            rhss_to_match = []
            for left_nt in left_nts:
                for right_nt in right_nts:
                    rhss_to_match.append([left_nt, right_nt])

        # iterate the grammar and find all the production in the
        # grammar with the matching RHS (Right-Hand-Side)
        for nt in self.grammar:
            for rhs in rhss_to_match:
                if rhs in self.grammar[nt]:
                    if i == j:
                        new_cell[nt] = rhs
                    else:
                        nt1 = rhs[0]
                        i_1, j_1 = left_cell['pos']

                        nt2 = rhs[1]
                        i_2, j_2 = right_cell['pos']

                        if nt not in new_cell.keys():
                            # In order to build a parse tree we also
                            # have to add information about the
                            # non-terminals used to derive to the new
                            # non-terminal and their respective
                            # positions in the parse table.
                            new_cell[nt] = [((nt1, i_1, j_1), (nt2, i_2, j_2))]
                        else:
                            # Save different ways of getting the same
                            # non-terminal in order to build all
                            # possible parse trees for a given sentence.
                            new_cell[nt] = new_cell[nt] + [((nt1, i_1, j_1), (nt2, i_2, j_2))]

        return 

    # parse the input string using the given CNF grammar.
    def parse(self):
        if self.grammar == None or self.inputString == None:
            # NOTE: careful, self.parse_table is still None!
            return
        
        n = len(self.inputString)
        self.parse_table = np.zeros((n, n), dtype=object)
            
        # Init step: fill the main diagonal
        for k in range(0, n):
            newCell = {"pos": (k, k)}
            self.__fill_parse_table_cell(newCell, {self.inputString[k]: [], "pos": (-1, -1)}, {"pos": (-1, -1)})
            self.parse_table[k][k] = newCell

        # Parsing step: fill the other diagonals
        for k in range(1, n):
            for j in range(k, n):
                # fill k-th upper diagonal
                newCell = {"pos": (j-k, j)}
                for i in range(0, k):            
                    self.__fill_parse_table_cell(newCell, self.parse_table[j-k][i+j-k], self.parse_table[i+1+j-k][j])
                self.parse_table[j-k][j] = newCell
                
        return

    def __get_first_parse_tree_rec(self, nt, i, j):
        if i == j:
            node = Node(nt)
            leaf = Node(self.parse_table[i][j][nt])
            node.left = leaf
            return node
        else:
            # consider only first possible way of transforming nt to
            # get the sub-string associated with (i, j)
            (nt1, i_1, j_1), (nt2, i_2, j_2) = self.parse_table[i][j][nt][0]
            
            node = Node(nt)
            node.left = self.__get_first_parse_tree_rec(nt1, i_1, j_1)
            node.right = self.__get_first_parse_tree_rec(nt2, i_2, j_2)
            return node

    def getFirstParseTree(self):
        if self.parse_table is None:
            self.parse()

        if self.parse_table is None:
            # cannot parse yet
            return None
        elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys():
            # string is not generated by the grammar
            return None
        else:
            root = self.__get_first_parse_tree_rec("S", 0, len(self.inputString) -1)
            return ParseTree(root)

    def __get_all_parse_trees_rec(self, nt, i, j):
        if i == j:
            node = Node(nt)
            leaf = Node(self.parse_table[i][j][nt])
            node.left = leaf
            return [node]
        else:
            res = []

            # for every possible way of expanding nt in
            #
            #           nt -> nt_left nt_right
            # 
            for value in self.parse_table[i][j][nt]:
                (nt_left, i_left, j_left), (nt_right, i_right, j_right) = value
            
                # count number of subtrees with root nt_left that
                # cover the string associated with (i_left, j_left)
                # and do the same for nt_right.
                left_sub_trees = self.__get_all_parse_trees_rec(nt_left, i_left, j_left)
                right_sub_trees = self.__get_all_parse_trees_rec(nt_right, i_right, j_right)

                # then combine each possible combination of subtree
                # rooted at nt_left and subtree rooted at nt_right to
                # get tree rooted at nt that covers the string
                # associated with (i, j).
                for i in range(0, len(left_sub_trees)):
                    for j in range(0, len(right_sub_trees)):
                        node = Node(nt)
                        node.left = left_sub_trees[i]
                        node.right = right_sub_trees[j]
                        res.append(node)

            # build the parse trees object for every tree created
            if nt == "S":
               trees = []
               for root in res:
                   trees.append(ParseTree(root))
               return trees
            else:
                return res

    # return all possible parse trees for the given string and the
    # given CNF grammar.
    #
    # TODO: test this function
    def getAllParseTrees(self):
        if self.parse_table is None:
            self.parse()

        if self.parse_table is None:
            # cannot parse yet
            return None
        elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys():
            # string not generated by the grammar
            return None
        else:
            return self.__get_all_parse_trees_rec("S", 0, len(self.inputString) - 1)

    # returns the number of valid parse trees that have root the non
    # terminal nt and that parse the sub-string associated to cell (i,
    # j) in the parse_table.
    def __count_parse_trees_rec(nt, i, j):
        if i == j:
            return 1
        else:
            total_count = 0

            # let C(nt) be the number of valid parse trees that parse
            # the sub-string associated with (i, j). Then, it can be
            # proved that
            #
            #   C(nt) = \sum_{nt -> nt_left nt_right} C(nt_left) * C(nt_right)
            # 
            # NOTE: to be formally correct, extra care should be put
            # to specify the indexes. 
            for value in self.parse_table[i][j][nt]:
                (nt_left, i_left, j_left), (nt_right, i_right, j_right) = value

                left_count = self.__count_parse_trees_rec(nt_left, i_left, j_left)
                right_count = self.__count_parse_trees_rec(nt_right, i_right, j_right)

                total_count += left_count * right_count
                
            return total_count
        
    # returns the number of valid parse trees that can be constructed
    # using the given CNF grammar to parse the given string.
    #
    # TODO: test this function
    def countParseTrees(self):
        if self.parse_table is None:
            self.parse()

        if self.parse_table is None:
            return -1
        elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys():
            return 0
        else:
            return self.__count_parse_trees_rec("S", 0, len(self.inputString) - 1)

    # TODO: test this 
    def readGrammarFromFile(filename):
        self.grammar = {}
        with open(filename, 'r') as f:
            for line in f:
                symbols = line.split(" -> ")
                prod_lh = symbols[0].strip()
                prod_rh = symbols[1].strip().split(" ")

                # found a rule of the form NT -> T
                # NOTE: normalize it using lower case        
                if len(prod_rh) == 1:
                    prod_rh = prod_rh[0].lower()
                elif len(prod_rh) > 2:
                    print("Error, grammar not in CNF form: " + prod_rh)
                    return "-1"

                # add rule prod_rh -> prod_lf to the grammar
                if prod_lh not in self.grammar:
                    self.grammar[prod_lh] = [prod_rh]
                else:
                    self.grammar[prod_lh].append(prod_rh)

        return

    def readSentence(string):
        self.inputString = string

# -------------- Ends classes definition area --------------        

# -------------- Code production area --------------

def read_sentences(filename):
    sentences = []
    with open(filename, 'r') as f:
        line = f.readline()
        while line:
            s = line.split("\t")
            for i in range(0, len(s)):
                s[i] = s[i].strip().lower()
            sentences.append(s)
            line = f.readline()
    return sentences

# Computes some statistics for a given grammar, such as:
#
# - number of binary rules  (A -> BC)
# - number of lexical rules (A -> a)
# - number of non-terminals
# - number of terminals
#
def compute_grammar_statistics(grammar):
    binary_rules = 0
    lexical_rules = 0
    terminals_found = set()

    for nt in g1:
        for lhs in g1[nt]:
            if isinstance(lhs, list):
                binary_rules += 1
            else:
                terminals_found.add(lhs)
                lexical_rules += 1

    return (binary_rules, lexical_rules, terminals_found)

# -------------- Code testing area --------------

# --------- test 1

grammar = {}
grammar["S"] = [["A", "B"], ["B", "C"]]
grammar["A"] = [["B", "C"], "a"]
grammar["B"] = [["B", "B"], ["C", "C"], "b"]
grammar["C"] = ["a"]

string_1 = "baaa"
string_3 = "aabaaa"

CYK = CYKParser(grammar, string_1)
CYK.parse()
t = CYK.getFirstParseTree()
t.labelTree()
t.writeTreeDotfile("tests/test.dot")

# --------- test 2
grammar = {}
grammar["S"] = [["A", "B"], ["A", "C"]]
grammar["A"] = [["C", "D"], ["D", "B"]]
grammar["B"] = ["b", "c"]
grammar["C"] = ["a", "c"]
grammar["D"] = ["b"]

string = "abc"
CYK = CYKParser(grammar, string)
CYK.parse()
trees = CYK.getAllParseTrees()

for i in range(0, len(trees)):
    tree = trees[i]
    tree.labelTree()
    tree.writeTreeDotfile("tests/prova" + str(i) + ".dot")