#!/usr/bin/python import numpy as np from node import Node from node import ParseTree ''' TODOs: - fix comment explaining CYK algorithm - post on github - add probabilistic support (?) Dependencies: - python2.7 -m pip install numpy - sudo apt-get install graphviz ''' ''' Grammar representation notes We will represent a grammar through a dictionary. In particular, given a non terminal symbol of the grammar NT, g[NT] contains a list of elements. The elements can either be list themselves, representing concatenations of non-terminal symbols, or simple strings, representing terminal symbols. grammar = {} grammar["S"] = [["A", "B"], ["B", "C"]] grammar["A"] = [["B", "C"], "a"] grammar["B"] = [["B", "B"], ["C", "C"], "b"] grammar["C"] = ["a"] ''' """ CYK algorithm implementation notes Assume that the grammar is in CNF. This allows us to make the following assumptions: 1) We don't need to check if we can match an entire substring with a single non-terminal because the grammar is in CNF. Returns an upper diagonal matrix that contains information about how a particular substring of the input string is parsed. In particular, the matrix can cyk_matrix[i][j] := {(NT, (i_1, j_1, NT1), (i_2, j_2, NT2)) | NT1 is in cyk_matrix[i_1][j_1], NT2 is in cyk_matrix[i_2][j_2], in the input grammar there is the rule NT -> NT1 NT2}, for i < j cyk_matrix[i][i] := {(NT)} For implementation purpose the matrix will contain dictionaries. In particular, for each cell of the matrix there will be a corresponding dictionary which will contain, as keys, all non-terminals that can be written in that cell, as well as a "pos" value. The "pos" key will have a value (i, j) indicating the position of the dictionary inside the matrix. The values associated with each non terminal symbol instead will be a list of tuples, where each tuple is defined as mentioned above. """ # ------------------------- class CYKParser: def __init__(self, grammar=None, inputString=None): # the chart used for parsing self.parse_table = None self.grammar = grammar self.inputString = inputString """' This function is used to fill a new cell of the parse table by using two previously filled cells. The cells are represented as dictionaries. :param left_cell: dictionary representing the left_cell. :param right_cell: dictionary representing the right_cell. :param new_cell: dictionary representing the new cell to be filled. """ def __fill_parse_table_cell(self, new_cell, left_cell, right_cell): i, j = new_cell["pos"] if i == j: # simple case: the new cell is in the main diagonal. # # in this case we only have to check if the are # non-terminals that match directly to our terminal of # interest, which is the i-th (or j-th) character of the # input string. for key in left_cell: if key != "pos": rhss_to_match = key else: left_nts = left_cell.keys()[:] right_nts = right_cell.keys()[:] left_nts.remove("pos"), right_nts.remove("pos") # compute the cartesian product between the non-terminals # of left_cell and the non-terminals of right_cell. rhss_to_match = [] for left_nt in left_nts: for right_nt in right_nts: rhss_to_match.append([left_nt, right_nt]) # iterate the grammar and find all the production in the # grammar with the matching RHS (Right-Hand-Side) for nt in self.grammar: for rhs in rhss_to_match: if rhs in self.grammar[nt]: if i == j: new_cell[nt] = rhs else: nt1 = rhs[0] i_1, j_1 = left_cell['pos'] nt2 = rhs[1] i_2, j_2 = right_cell['pos'] if nt not in new_cell.keys(): # In order to build a parse tree we also # have to add information about the # non-terminals used to derive to the new # non-terminal and their respective # positions in the parse table. new_cell[nt] = [((nt1, i_1, j_1), (nt2, i_2, j_2))] else: # Save different ways of getting the same # non-terminal in order to build all # possible parse trees for a given sentence. new_cell[nt] = new_cell[nt] + [((nt1, i_1, j_1), (nt2, i_2, j_2))] return # parse the input string using the given CNF grammar. def parse(self): if self.grammar == None or self.inputString == None: # NOTE: careful, self.parse_table is still None! return n = len(self.inputString) self.parse_table = np.zeros((n, n), dtype=object) # Init step: fill the main diagonal for k in range(0, n): newCell = {"pos": (k, k)} self.__fill_parse_table_cell(newCell, {self.inputString[k]: [], "pos": (-1, -1)}, {"pos": (-1, -1)}) self.parse_table[k][k] = newCell # Parsing step: fill the other diagonals for k in range(1, n): for j in range(k, n): # fill k-th upper diagonal newCell = {"pos": (j-k, j)} for i in range(0, k): self.__fill_parse_table_cell(newCell, self.parse_table[j-k][i+j-k], self.parse_table[i+1+j-k][j]) self.parse_table[j-k][j] = newCell return def __get_first_parse_tree_rec(self, nt, i, j): if i == j: node = Node(nt) leaf = Node(self.parse_table[i][j][nt]) node.left = leaf return node else: # consider only first possible way of transforming nt to # get the sub-string associated with (i, j) (nt1, i_1, j_1), (nt2, i_2, j_2) = self.parse_table[i][j][nt][0] node = Node(nt) node.left = self.__get_first_parse_tree_rec(nt1, i_1, j_1) node.right = self.__get_first_parse_tree_rec(nt2, i_2, j_2) return node def getFirstParseTree(self): if self.parse_table is None: self.parse() if self.parse_table is None: # cannot parse yet return None elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys(): # string is not generated by the grammar return None else: root = self.__get_first_parse_tree_rec("S", 0, len(self.inputString) -1) return ParseTree(root) def __get_all_parse_trees_rec(self, nt, i, j): if i == j: node = Node(nt) leaf = Node(self.parse_table[i][j][nt]) node.left = leaf return [node] else: res = [] # for every possible way of expanding nt in # # nt -> nt_left nt_right # for value in self.parse_table[i][j][nt]: (nt_left, i_left, j_left), (nt_right, i_right, j_right) = value # count number of subtrees with root nt_left that # cover the string associated with (i_left, j_left) # and do the same for nt_right. left_sub_trees = self.__get_all_parse_trees_rec(nt_left, i_left, j_left) right_sub_trees = self.__get_all_parse_trees_rec(nt_right, i_right, j_right) # then combine each possible combination of subtree # rooted at nt_left and subtree rooted at nt_right to # get tree rooted at nt that covers the string # associated with (i, j). for i in range(0, len(left_sub_trees)): for j in range(0, len(right_sub_trees)): node = Node(nt) node.left = left_sub_trees[i] node.right = right_sub_trees[j] res.append(node) # build the parse trees object for every tree created if nt == "S": trees = [] for root in res: trees.append(ParseTree(root)) return trees else: return res # return all possible parse trees for the given string and the # given CNF grammar. # # TODO: test this function def getAllParseTrees(self): if self.parse_table is None: self.parse() if self.parse_table is None: # cannot parse yet return None elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys(): # string not generated by the grammar return None else: return self.__get_all_parse_trees_rec("S", 0, len(self.inputString) - 1) # returns the number of valid parse trees that have root the non # terminal nt and that parse the sub-string associated to cell (i, # j) in the parse_table. def __count_parse_trees_rec(nt, i, j): if i == j: return 1 else: total_count = 0 # let C(nt) be the number of valid parse trees that parse # the sub-string associated with (i, j). Then, it can be # proved that # # C(nt) = \sum_{nt -> nt_left nt_right} C(nt_left) * C(nt_right) # # NOTE: to be formally correct, extra care should be put # to specify the indexes. for value in self.parse_table[i][j][nt]: (nt_left, i_left, j_left), (nt_right, i_right, j_right) = value left_count = self.__count_parse_trees_rec(nt_left, i_left, j_left) right_count = self.__count_parse_trees_rec(nt_right, i_right, j_right) total_count += left_count * right_count return total_count # returns the number of valid parse trees that can be constructed # using the given CNF grammar to parse the given string. # # TODO: test this function def countParseTrees(self): if self.parse_table is None: self.parse() if self.parse_table is None: return -1 elif "S" not in self.parse_table[0][len(self.inputString) - 1].keys(): return 0 else: return self.__count_parse_trees_rec("S", 0, len(self.inputString) - 1) # TODO: test this def readGrammarFromFile(filename): self.grammar = {} with open(filename, 'r') as f: for line in f: symbols = line.split(" -> ") prod_lh = symbols[0].strip() prod_rh = symbols[1].strip().split(" ") # found a rule of the form NT -> T # NOTE: normalize it using lower case if len(prod_rh) == 1: prod_rh = prod_rh[0].lower() elif len(prod_rh) > 2: print("Error, grammar not in CNF form: " + prod_rh) return "-1" # add rule prod_rh -> prod_lf to the grammar if prod_lh not in self.grammar: self.grammar[prod_lh] = [prod_rh] else: self.grammar[prod_lh].append(prod_rh) return def readSentence(string): self.inputString = string # -------------- Ends classes definition area -------------- # -------------- Code production area -------------- def read_sentences(filename): sentences = [] with open(filename, 'r') as f: line = f.readline() while line: s = line.split("\t") for i in range(0, len(s)): s[i] = s[i].strip().lower() sentences.append(s) line = f.readline() return sentences # Computes some statistics for a given grammar, such as: # # - number of binary rules (A -> BC) # - number of lexical rules (A -> a) # - number of non-terminals # - number of terminals # def compute_grammar_statistics(grammar): binary_rules = 0 lexical_rules = 0 terminals_found = set() for nt in g1: for lhs in g1[nt]: if isinstance(lhs, list): binary_rules += 1 else: terminals_found.add(lhs) lexical_rules += 1 return (binary_rules, lexical_rules, terminals_found) # -------------- Code testing area -------------- # --------- test 1 grammar = {} grammar["S"] = [["A", "B"], ["B", "C"]] grammar["A"] = [["B", "C"], "a"] grammar["B"] = [["B", "B"], ["C", "C"], "b"] grammar["C"] = ["a"] string_1 = "baaa" string_3 = "aabaaa" CYK = CYKParser(grammar, string_1) CYK.parse() t = CYK.getFirstParseTree() t.labelTree() t.writeTreeDotfile("tests/test.dot") # --------- test 2 grammar = {} grammar["S"] = [["A", "B"], ["A", "C"]] grammar["A"] = [["C", "D"], ["D", "B"]] grammar["B"] = ["b", "c"] grammar["C"] = ["a", "c"] grammar["D"] = ["b"] string = "abc" CYK = CYKParser(grammar, string) CYK.parse() trees = CYK.getAllParseTrees() for i in range(0, len(trees)): tree = trees[i] tree.labelTree() tree.writeTreeDotfile("tests/prova" + str(i) + ".dot")