# Natural Language Toolkit: Glue Semantics
#
# Author: Dan Garrette <dhgarrette@gmail.com>
#
# Copyright (C) 2001-2012 NLTK Project
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function
import os
import nltk
from nltk.internals import Counter
from nltk.corpus import brown
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger
from nltk.sem.logic import (LogicParser, Expression, Variable, VariableExpression,
LambdaExpression, AbstractVariableExpression)
import drt
import linearlogic
SPEC_SEMTYPES = {'a' : 'ex_quant',
'an' : 'ex_quant',
'every' : 'univ_quant',
'the' : 'def_art',
'no' : 'no_quant',
'default' : 'ex_quant'}
OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct']
[docs]class GlueDict(dict):
def __init__(self, filename):
self.filename = filename
self.read_file()
[docs] def read_file(self, empty_first=True):
if empty_first:
self.clear()
try:
f = nltk.data.find(
os.path.join('grammars', 'sample_grammars', self.filename))
# if f is a ZipFilePathPointer or a FileSystemPathPointer
# then we need a little extra massaging
if hasattr(f, 'open'):
f = f.open()
except LookupError as e:
try:
f = open(self.filename)
except LookupError:
raise e
lines = f.readlines()
f.close()
for line in lines: # example: 'n : (\\x.(<word> x), (v-or))'
# lambdacalc -^ linear logic -^
line = line.strip() # remove trailing newline
if not len(line): continue # skip empty lines
if line[0] == '#': continue # skip commented out lines
parts = line.split(' : ', 2) # ['verb', '(\\x.(<word> x), ( subj -o f ))', '[subj]']
glue_formulas = []
parenCount = 0
tuple_start = 0
tuple_comma = 0
relationships = None
if len(parts) > 1:
for (i, c) in enumerate(parts[1]):
if c == '(':
if parenCount == 0: # if it's the first '(' of a tuple
tuple_start = i+1 # then save the index
parenCount += 1
elif c == ')':
parenCount -= 1
if parenCount == 0: # if it's the last ')' of a tuple
meaning_term = parts[1][tuple_start:tuple_comma] # '\\x.(<word> x)'
glue_term = parts[1][tuple_comma+1:i] # '(v-r)'
glue_formulas.append([meaning_term, glue_term]) # add the GlueFormula to the list
elif c == ',':
if parenCount == 1: # if it's a comma separating the parts of the tuple
tuple_comma = i # then save the index
elif c == '#': # skip comments at the ends of lines
if parenCount != 0: # if the line hasn't parsed correctly so far
raise RuntimeError('Formula syntax is incorrect for entry ' + line)
break # break to the next line
if len(parts) > 2: #if there is a relationship entry at the end
relStart = parts[2].index('[')+1
relEnd = parts[2].index(']')
if relStart == relEnd:
relationships = frozenset()
else:
relationships = frozenset([r.strip() for r in parts[2][relStart:relEnd].split(',')])
try:
startInheritance = parts[0].index('(')
endInheritance = parts[0].index(')')
sem = parts[0][:startInheritance].strip()
supertype = parts[0][startInheritance+1:endInheritance]
except:
sem = parts[0].strip()
supertype = None
if sem not in self:
self[sem] = {}
if relationships is None: #if not specified for a specific relationship set
#add all relationship entries for parents
if supertype:
for rels, glue in self[supertype].iteritems():
if rels not in self[sem]:
self[sem][rels] = []
self[sem][rels].extend(glue)
self[sem][rels].extend(glue_formulas) # add the glue formulas to every rel entry
else:
if None not in self[sem]:
self[sem][None] = []
self[sem][None].extend(glue_formulas) # add the glue formulas to every rel entry
else:
if relationships not in self[sem]:
self[sem][relationships] = []
if supertype:
self[sem][relationships].extend(self[supertype][relationships])
self[sem][relationships].extend(glue_formulas) # add the glue entry to the dictionary
def __str__(self):
accum = ''
for pos in self:
for relset in self[pos]:
i = 1
for gf in self[pos][relset]:
if i==1:
accum += str(pos) + ': '
else:
accum += ' '*(len(str(pos))+2)
accum += str(gf)
if relset and i==len(self[pos][relset]):
accum += ' : ' + str(relset)
accum += '\n'
i += 1
return accum
[docs] def lookup(self, node, depgraph, counter):
semtype_names = self.get_semtypes(node)
semtype = None
for name in semtype_names:
if name in self:
semtype = self[name]
break
if semtype is None:
# raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
return []
self.add_missing_dependencies(node, depgraph)
lookup = self._lookup_semtype_option(semtype, node, depgraph)
if not len(lookup):
raise KeyError("There is no GlueDict entry for sem type of '%s'"\
" with tag '%s', and rel '%s'" %\
(node['word'], node['tag'], node['rel']))
return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
[docs] def add_missing_dependencies(self, node, depgraph):
rel = node['rel'].lower()
if rel == 'main':
headnode = depgraph.nodelist[node['head']]
subj = self.lookup_unique('subj', headnode, depgraph)
node['deps'].append(subj['address'])
def _lookup_semtype_option(self, semtype, node, depgraph):
relationships = frozenset([depgraph.nodelist[dep]['rel'].lower()
for dep in node['deps']
if depgraph.nodelist[dep]['rel'].lower()
not in OPTIONAL_RELATIONSHIPS])
try:
lookup = semtype[relationships]
except KeyError:
# An exact match is not found, so find the best match where
# 'best' is defined as the glue entry whose relationship set has the
# most relations of any possible relationship set that is a subset
# of the actual depgraph
best_match = frozenset()
for relset_option in set(semtype)-set([None]):
if len(relset_option) > len(best_match) and \
relset_option < relationships:
best_match = relset_option
if not best_match:
if None in semtype:
best_match = None
else:
return None
lookup = semtype[best_match]
return lookup
[docs] def get_semtypes(self, node):
"""
Based on the node, return a list of plausible semtypes in order of
plausibility.
"""
semtype_name = None
rel = node['rel'].lower()
word = node['word'].lower()
if rel == 'spec':
if word in SPEC_SEMTYPES:
return [SPEC_SEMTYPES[word]]
else:
return [SPEC_SEMTYPES['default']]
elif rel in ['nmod', 'vmod']:
return [node['tag'], rel]
else:
return [node['tag']]
[docs] def get_glueformulas_from_semtype_entry(self, lookup, word, node, depgraph, counter):
glueformulas = []
glueFormulaFactory = self.get_GlueFormula_factory()
for meaning, glue in lookup:
gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue)
if not len(glueformulas):
gf.word = word
else:
gf.word = '%s%s' % (word, len(glueformulas)+1)
gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get())
glueformulas.append(gf)
return glueformulas
[docs] def initialize_labels(self, expr, node, depgraph, unique_index):
if isinstance(expr, linearlogic.AtomicExpression):
name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index)
if name[0].isupper():
return linearlogic.VariableExpression(name)
else:
return linearlogic.ConstantExpression(name)
else:
return linearlogic.ImpExpression(
self.initialize_labels(expr.antecedent, node, depgraph, unique_index),
self.initialize_labels(expr.consequent, node, depgraph, unique_index))
[docs] def find_label_name(self, name, node, depgraph, unique_index):
try:
dot = name.index('.')
before_dot = name[:dot]
after_dot = name[dot+1:]
if before_dot == 'super':
return self.find_label_name(after_dot, depgraph.nodelist[node['head']], depgraph, unique_index)
else:
return self.find_label_name(after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index)
except ValueError:
lbl = self.get_label(node)
if name=='f': return lbl
elif name=='v': return '%sv' % lbl
elif name=='r': return '%sr' % lbl
elif name=='super': return self.get_label(depgraph.nodelist[node['head']])
elif name=='var': return '%s%s' % (lbl.upper(), unique_index)
elif name=='a': return self.get_label(self.lookup_unique('conja', node, depgraph))
elif name=='b': return self.get_label(self.lookup_unique('conjb', node, depgraph))
else: return self.get_label(self.lookup_unique(name, node, depgraph))
[docs] def get_label(self, node):
"""
Pick an alphabetic character as identifier for an entity in the model.
:param value: where to index into the list of characters
:type value: int
"""
value = node['address']
letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s',
't','u','v','w','x','y','z','a','b','c','d','e'][value-1]
num = int(value) / 26
if num > 0:
return letter + str(num)
else:
return letter
[docs] def lookup_unique(self, rel, node, depgraph):
"""
Lookup 'key'. There should be exactly one item in the associated relation.
"""
deps = [depgraph.nodelist[dep] for dep in node['deps']
if depgraph.nodelist[dep]['rel'].lower() == rel.lower()]
if len(deps) == 0:
raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel))
elif len(deps) > 1:
raise KeyError("'%s' should only have one feature '%s'" % (node['word'], rel))
else:
return deps[0]
[docs]class Glue(object):
def __init__(self, semtype_file=None, remove_duplicates=False,
depparser=None, verbose=False):
self.verbose = verbose
self.remove_duplicates = remove_duplicates
self.depparser = depparser
from nltk import Prover9
self.prover = Prover9()
if semtype_file:
self.semtype_file = semtype_file
else:
self.semtype_file = 'glue.semtype'
[docs] def train_depparser(self, depgraphs=None):
if depgraphs:
self.depparser.train(depgraphs)
else:
self.depparser.train_from_file(nltk.data.find(
os.path.join('grammars', 'sample_grammars',
'glue_train.conll')))
[docs] def parse_to_meaning(self, sentence):
readings = []
for agenda in self.parse_to_compiled(sentence):
readings.extend(self.get_readings(agenda))
return readings
[docs] def get_readings(self, agenda):
readings = []
agenda_length = len(agenda)
atomics = dict()
nonatomics = dict()
while agenda: # is not empty
cur = agenda.pop()
glue_simp = cur.glue.simplify()
if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic
for key in atomics:
try:
if isinstance(cur.glue, linearlogic.ApplicationExpression):
bindings = cur.glue.bindings
else:
bindings = linearlogic.BindingDict()
glue_simp.antecedent.unify(key, bindings)
for atomic in atomics[key]:
if not (cur.indices & atomic.indices): # if the sets of indices are disjoint
try:
agenda.append(cur.applyto(atomic))
except linearlogic.LinearLogicApplicationException:
pass
except linearlogic.UnificationException:
pass
try:
nonatomics[glue_simp.antecedent].append(cur)
except KeyError:
nonatomics[glue_simp.antecedent] = [cur]
else: # else cur.glue is atomic
for key in nonatomics:
for nonatomic in nonatomics[key]:
try:
if isinstance(nonatomic.glue, linearlogic.ApplicationExpression):
bindings = nonatomic.glue.bindings
else:
bindings = linearlogic.BindingDict()
glue_simp.unify(key, bindings)
if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint
try:
agenda.append(nonatomic.applyto(cur))
except linearlogic.LinearLogicApplicationException:
pass
except linearlogic.UnificationException:
pass
try:
atomics[glue_simp].append(cur)
except KeyError:
atomics[glue_simp] = [cur]
for entry in atomics:
for gf in atomics[entry]:
if len(gf.indices) == agenda_length:
self._add_to_reading_list(gf, readings)
for entry in nonatomics:
for gf in nonatomics[entry]:
if len(gf.indices) == agenda_length:
self._add_to_reading_list(gf, readings)
return readings
def _add_to_reading_list(self, glueformula, reading_list):
add_reading = True
if self.remove_duplicates:
for reading in reading_list:
try:
if reading.equiv(glueformula.meaning, self.prover):
add_reading = False
break;
except Exception as e:
#if there is an exception, the syntax of the formula
#may not be understandable by the prover, so don't
#throw out the reading.
print('Error when checking logical equality of statements', e)
pass
if add_reading:
reading_list.append(glueformula.meaning)
[docs] def parse_to_compiled(self, sentence='a man sees Mary'.split()):
gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)]
return [self.gfl_to_compiled(gfl) for gfl in gfls]
[docs] def dep_parse(self, sentence='every cat leaves'.split()):
#Lazy-initialize the depparser
if self.depparser is None:
from nltk.parse import MaltParser
self.depparser = MaltParser(tagger=self.get_pos_tagger())
if not self.depparser._trained:
self.train_depparser()
return [self.depparser.parse(sentence, verbose=self.verbose)]
[docs] def depgraph_to_glue(self, depgraph):
return self.get_glue_dict().to_glueformula_list(depgraph)
[docs] def get_glue_dict(self):
return GlueDict(self.semtype_file)
[docs] def gfl_to_compiled(self, gfl):
index_counter = Counter()
return_list = []
for gf in gfl:
return_list.extend(gf.compile(index_counter))
if self.verbose:
print('Compiled Glue Premises:')
for cgf in return_list:
print(cgf)
return return_list
[docs] def get_pos_tagger(self):
regexp_tagger = RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN') # nouns (default)
])
brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)
#Override particular words
main_tagger = RegexpTagger(
[(r'(A|a|An|an)$', 'ex_quant'),
(r'(Every|every|All|all)$', 'univ_quant')
], backoff=trigram_tagger)
return main_tagger
[docs]class DrtGlueDict(GlueDict):
[docs]class DrtGlue(Glue):
def __init__(self, semtype_file=None, remove_duplicates=False,
depparser=None, verbose=False):
if not semtype_file:
semtype_file = 'drt_glue.semtype'
Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose)
[docs] def get_glue_dict(self):
return DrtGlueDict(self.semtype_file)
[docs]def demo(show_example=-1):
from nltk.parse import MaltParser
examples = ['David sees Mary',
'David eats a sandwich',
'every man chases a dog',
'every man believes a dog sleeps',
'John gives David a sandwich',
'John chases himself']
# 'John persuades David to order a pizza',
# 'John tries to go',
# 'John tries to find a unicorn',
# 'John seems to vanish',
# 'a unicorn seems to approach',
# 'every big cat leaves',
# 'every gray cat leaves',
# 'every big gray cat leaves',
# 'a former senator leaves',
print('============== DEMO ==============')
tagger = RegexpTagger(
[('^(David|Mary|John)$', 'NNP'),
('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'),
('^(go|order|vanish|find|approach)$', 'VB'),
('^(a)$', 'ex_quant'),
('^(every)$', 'univ_quant'),
('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'),
('^(big|gray|former)$', 'JJ'),
('^(him|himself)$', 'PRP')
])
depparser = MaltParser(tagger=tagger)
glue = Glue(depparser=depparser, verbose=False)
for (i, sentence) in enumerate(examples):
if i==show_example or show_example==-1:
print('[[[Example %s]]] %s' % (i, sentence))
for reading in glue.parse_to_meaning(sentence.split()):
print(reading.simplify())
print('')
if __name__ == '__main__':
demo()