Source code for nltk.parse.earleychart

# -*- coding: utf-8 -*-
# Natural Language Toolkit: An Incremental Earley Chart Parser
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
#         Rob Speer <rspeer@mit.edu>
#         Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
#         Jean Mark Gawron <gawron@mail.sdsu.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Data classes and parser implementations for *incremental* chart
parsers, which use dynamic programming to efficiently parse a text.
A "chart parser" derives parse trees for a text by iteratively adding
\"edges\" to a \"chart\".  Each "edge" represents a hypothesis about the tree
structure for a subsequence of the text.  The "chart" is a
\"blackboard\" for composing and combining these hypotheses.

A parser is "incremental", if it guarantees that for all i, j where i < j,
all edges ending at i are built before any edges ending at j.
This is appealing for, say, speech recognizer hypothesis filtering.

The main parser class is ``EarleyChartParser``, which is a top-down
algorithm, originally formulated by Jay Earley (1970).
"""

from __future__ import print_function
from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule,
                              BottomUpPredictRule, BottomUpPredictCombineRule,
                              TopDownInitRule, SingleEdgeFundamentalRule,
                              EmptyPredictRule,
                              CachedTopDownPredictRule,
                              FilteredSingleEdgeFundamentalRule,
                              FilteredBottomUpPredictCombineRule)
from nltk.parse.featurechart import (FeatureChart, FeatureChartParser,
                                     FeatureTopDownInitRule,
                                     FeatureTopDownPredictRule,
                                     FeatureEmptyPredictRule,
                                     FeatureBottomUpPredictRule,
                                     FeatureBottomUpPredictCombineRule,
                                     FeatureSingleEdgeFundamentalRule)

#////////////////////////////////////////////////////////////
# Incremental Chart
#////////////////////////////////////////////////////////////

[docs]class IncrementalChart(Chart):
[docs] def initialize(self): # A sequence of edge lists contained in this chart. self._edgelists = tuple([] for x in self._positions()) # The set of child pointer lists associated with each edge. self._edge_to_cpls = {} # Indexes mapping attribute values to lists of edges # (used by select()). self._indexes = {}
[docs] def edges(self): return list(self.iteredges())
[docs] def iteredges(self): return (edge for edgelist in self._edgelists for edge in edgelist)
[docs] def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions=={}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = restrictions.keys() restr_keys.sort() restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(restrictions[key] for key in restr_keys) return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError('Bad restriction: %s' % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple(getattr(edge, key)() for key in restr_keys) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple(getattr(edge, key)() for key in restr_keys) index[end].setdefault(vals, []).append(edge) def _append_edge(self, edge): self._edgelists[edge.end()].append(edge) def _positions(self): return xrange(self.num_leaves() + 1)
[docs]class FeatureIncrementalChart(IncrementalChart, FeatureChart):
[docs] def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions=={}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = restrictions.keys() restr_keys.sort() restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(self._get_type_if_possible(restrictions[key]) for key in restr_keys) return iter(self._indexes[restr_keys][end].get(vals, []))
def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError('Bad restriction: %s' % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple(self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple(self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys) index[end].setdefault(vals, []).append(edge) #//////////////////////////////////////////////////////////// # Incremental CFG Rules #////////////////////////////////////////////////////////////
[docs]class CompleteFundamentalRule(SingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. for right_edge in chart.select(start=end, end=end, is_complete=True, lhs=left_edge.next()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge
[docs]class CompleterRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule()
[docs] def apply_iter(self, chart, grammar, edge): if not isinstance(edge, LeafEdge): for new_edge in self._fundamental_rule.apply_iter(chart, grammar, edge): yield new_edge
[docs]class ScannerRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule()
[docs] def apply_iter(self, chart, grammar, edge): if isinstance(edge, LeafEdge): for new_edge in self._fundamental_rule.apply_iter(chart, grammar, edge): yield new_edge
[docs]class PredictorRule(CachedTopDownPredictRule): pass
[docs]class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
[docs] def apply_iter(self, chart, grammar, edge): # Since the Filtered rule only works for grammars without empty productions, # we only have to bother with complete edges here. if edge.is_complete(): for new_edge in self._apply_complete(chart, grammar, edge): yield new_edge #//////////////////////////////////////////////////////////// # Incremental FCFG Rules #////////////////////////////////////////////////////////////
[docs]class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): fr = self._fundamental_rule end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. for right_edge in chart.select(start=end, end=end, is_complete=True, lhs=left_edge.next()): for new_edge in fr.apply_iter(chart, grammar, left_edge, right_edge): yield new_edge
[docs]class FeatureCompleterRule(CompleterRule): _fundamental_rule = FeatureCompleteFundamentalRule()
[docs]class FeatureScannerRule(ScannerRule): _fundamental_rule = FeatureCompleteFundamentalRule()
[docs]class FeaturePredictorRule(FeatureTopDownPredictRule): pass #//////////////////////////////////////////////////////////// # Incremental CFG Chart Parsers #////////////////////////////////////////////////////////////
EARLEY_STRATEGY = [LeafInitRule(), TopDownInitRule(), CompleterRule(), ScannerRule(), PredictorRule()] TD_INCREMENTAL_STRATEGY = [LeafInitRule(), TopDownInitRule(), CachedTopDownPredictRule(), CompleteFundamentalRule()] BU_INCREMENTAL_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictRule(), CompleteFundamentalRule()] BU_LC_INCREMENTAL_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictCombineRule(), CompleteFundamentalRule()] LC_INCREMENTAL_STRATEGY = [LeafInitRule(), FilteredBottomUpPredictCombineRule(), FilteredCompleteFundamentalRule()]
[docs]class IncrementalChartParser(ChartParser): """ An *incremental* chart parser implementing Jay Earley's parsing algorithm: | For each index end in [0, 1, ..., N]: | For each edge such that edge.end = end: | If edge is incomplete and edge.next is not a part of speech: | Apply PredictorRule to edge | If edge is incomplete and edge.next is a part of speech: | Apply ScannerRule to edge | If edge is complete: | Apply CompleterRule to edge | Return any complete parses in the chart """ def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_STRATEGY, trace=0, trace_chart_width=50, chart_class=IncrementalChart): """ Create a new Earley chart parser, that uses ``grammar`` to parse texts. :type grammar: ContextFreeGrammar :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :type trace_chart_width: int :param trace_chart_width: The default total width reserved for the chart in trace output. The remainder of each line will be used to display edges. :param chart_class: The class that should be used to create the charts used by this parser. """ self._grammar = grammar self._trace = trace self._trace_chart_width = trace_chart_width self._chart_class = chart_class self._axioms = [] self._inference_rules = [] for rule in strategy: if rule.NUM_EDGES == 0: self._axioms.append(rule) elif rule.NUM_EDGES == 1: self._inference_rules.append(rule) else: raise ValueError("Incremental inference rules must have " "NUM_EDGES == 0 or 1")
[docs] def chart_parse(self, tokens, trace=None): if trace is None: trace = self._trace trace_new_edges = self._trace_new_edges tokens = list(tokens) self._grammar.check_coverage(tokens) chart = self._chart_class(tokens) grammar = self._grammar # Width, for printing trace edges. trace_edge_width = self._trace_chart_width / (chart.num_leaves() + 1) if trace: print(chart.pp_leaves(trace_edge_width)) for axiom in self._axioms: new_edges = axiom.apply(chart, grammar) trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) inference_rules = self._inference_rules for end in range(chart.num_leaves()+1): if trace > 1: print("\n* Processing queue:", end, "\n") agenda = list(chart.select(end=end)) while agenda: edge = agenda.pop() for rule in inference_rules: new_edges = rule.apply_iter(chart, grammar, edge) if trace: new_edges = list(new_edges) trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) for new_edge in new_edges: if new_edge.end()==end: agenda.append(new_edge) return chart
[docs]class EarleyChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args) pass
[docs]class IncrementalTopDownChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args)
[docs]class IncrementalBottomUpChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args)
[docs]class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args)
[docs]class IncrementalLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): if not grammar.is_nonempty(): raise ValueError("IncrementalLeftCornerParser only works for grammars " "without empty productions.") IncrementalChartParser.__init__(self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args) #//////////////////////////////////////////////////////////// # Incremental FCFG Chart Parsers #////////////////////////////////////////////////////////////
EARLEY_FEATURE_STRATEGY = [LeafInitRule(), FeatureTopDownInitRule(), FeatureCompleterRule(), FeatureScannerRule(), FeaturePredictorRule()] TD_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureTopDownInitRule(), FeatureTopDownPredictRule(), FeatureCompleteFundamentalRule()] BU_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictRule(), FeatureCompleteFundamentalRule()] BU_LC_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictCombineRule(), FeatureCompleteFundamentalRule()]
[docs]class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser): def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureIncrementalChart, **parser_args): IncrementalChartParser.__init__(self, grammar, strategy=strategy, trace_chart_width=trace_chart_width, chart_class=chart_class, **parser_args)
[docs]class FeatureEarleyChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args)
[docs]class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
[docs]class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args)
[docs]class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args) #//////////////////////////////////////////////////////////// # Demonstration #////////////////////////////////////////////////////////////
[docs]def demo(should_print_times=True, should_print_grammar=False, should_print_trees=True, trace=2, sent='I saw John with a dog with my cookie', numparses=5): """ A demonstration of the Earley parsers. """ import sys, time from nltk.parse.chart import demo_grammar # The grammar for ChartParser and SteppingChartParser: grammar = demo_grammar() if should_print_grammar: print("* Grammar") print(grammar) # Tokenize the sample sentence. print("* Sentence:") print(sent) tokens = sent.split() print(tokens) print() # Do the parsing. earley = EarleyChartParser(grammar, trace=trace) t = time.clock() chart = earley.chart_parse(tokens) parses = chart.parses(grammar.start()) t = time.clock()-t # Print results. if numparses: assert len(parses)==numparses, 'Not all parses found' if should_print_trees: for tree in parses: print(tree) else: print("Nr trees:", len(parses)) if should_print_times: print("Time:", t)
if __name__ == '__main__': demo()