"""
Generate a list of random sentences of a given derivation depth from a
context-free grammar. Code by Dana Angluin. The main function is
random_sentences.
"""
from __future__ import division
import random
import nltk.grammar
from nltk import CFG
from nltk.parse.generate import generate
[docs]def random_sentences(count, depth, gr):
"""
Generates a number of random sentences using derivations of at most
a given depth.
:type count: int
:param count: The number of sentences to generate
:type depth: int
:param depth: The maximum derivation depth of a generated sentence
:type gr: CFG
:param gr: A context-free grammar to generate from
:type: list
:return: The generated sentences
"""
nonterminals = nonterminals_from_grammar(gr)
gr_table = make_table(depth, gr)
sentences = [random_from_form([gr.start()], depth, gr_table,
nonterminals, gr) for _ in xrange(count)]
return sentences
[docs]def make_table(depth, gr):
"""
For each production p of a context-free grammar and each number k,
this function computes the number of terminal strings whose
derivations
- invoke the production p as the first step and
- have depth at most k.
:type depth: int
:param depth: The maximum possible value of k (see above)
:type gr: CFG
:param gr: A context-free grammar
:rtype: dict
:return: For each production p and number k, the return dict maps
the tuple (p, k) to the number described above
"""
productions = gr.productions()
table = {}
for prod in productions:
table[(prod, 0)] = 0
if is_terminal_production(prod, gr):
table[(prod, 1)] = 1
else:
table[(prod, 1)] = 0
for k in xrange(2, depth + 1):
for prod in productions:
table[(prod, k)] = count_production_depth(prod, k, table, gr)
return table
""" Helper Functions """
[docs]def remove_duplicates(lst):
"""
Removes duplicates from a given list.
:type lst: list
:param lst: A list
:rtype: list
:return: lst, but with duplicates removed
"""
return list(set(lst))
[docs]def all_not_in(lst1, lst2):
"""
Determines whether a list contains an element of another list.
:type lst1: list
:param lst1: A list
:type lst2: list
:param lst2: Another list
:rtype: bool
:return: True if no element of lst1 is in list2, False otherwise
"""
return set(lst1).isdisjoint(lst2)
[docs]def select_from_dist(prob):
"""
This function chooses a random number according to a given
probability distribution.
:type prob: list
:param prob: A probability distribution represented as a list. For
each number i, prob[i] is the probability that this function
returns i. For example, if [.2, .3, .5] is passed to this
parameter, then there is a 20% chance of returning 0, a 30%
chance of returning 1, and a 50% chance of returning 2.
:rtype: int
:return: The number chosen
"""
x = random.random()
cumulative = 0.0
for i in xrange(len(prob)):
if x <= cumulative + prob[i]:
return i
else:
cumulative += prob[i]
return len(prob)
[docs]def all_lhs_from_grammar(gr):
"""
Finds the left-hand sides of all productions in a context-free
grammar. The return value may contain duplicates.
:type gr: CFG
:param gr: A context-free grammar
:rtype: list
:return: All the nonterminals appearing in the left-hand side of a
production of gr
"""
return [prod.lhs() for prod in gr.productions()]
[docs]def nonterminals_from_grammar(gr):
"""
Finds the left-hand sides of all productions in a context-free
grammar. The return value does not contain duplicates.
:type gr: CFG
:param gr: A context-free grammar
:rtype: list
:return: All the nonterminals appearing in the left-hand side of a
production of gr
"""
return remove_duplicates(all_lhs_from_grammar(gr))
[docs]def is_terminal_production(prod, gr):
"""
Determins whether or not a rule contains nonterminals in its right-
hand side.
:type prod: nltk.grammar.Production
:param prod: A production
:type gr: CFG
:param gr: A CFG
:rtype: bool
:return: True if there are no nonterminals in the right-hand side of
prod, False otherwise
"""
return all(nltk.grammar.is_terminal(n) for n in prod.rhs())
[docs]def count_nonterminal_depth(nonterminal, depth, table, gr):
"""
A helper function for make_table. This function computes the number
of terminal strings generable from a nonterminal using a derivation
of at most a given depth. This function assumes that the number for
the previous depth has already been computed.
:type nonterminal: nltk.grammar.Nonterminal
:param nonterminal: The nonterminal from which generable strings are
considered
:type depth: int
:param depth: The maximum depth of derivations considered
:type table: dict
:param table: A table containing the results obtained from this
function for the previous depth. The format of this table is the
same as the return value of make_table
:type gr: CFG
:param gr: The grammar whose generable strings are being considered
:rtype: int
:return: The number of strings generable by gr from nonterminal
using derivations at most depth-many layers deep
"""
nt_prods = gr.productions(lhs=nonterminal)
total = 0
for prod in nt_prods:
total += table[(prod, depth)]
return total
[docs]def count_production_depth(prod, depth, table, gr):
"""
A helper function for make_table. This function computes the number
of terminal strings generable using a derivation of at most a given
depth that invokes a given production as its first step. This
function assumes that the number for the previous depth has already
been computed.
:type prod: nltk.grammar.Production
:param prod: The production invoked during the first step of the
derivations considered
:type depth: int
:param depth: The maximum depth of derivations considered
:type table: dict
:param table: A table containing the results obtained from this
function for the previous depth. The format of this table is the
same as the return value of make_table
:type gr: CFG
:param gr: The grammar whose generable strings are being considered
:rtype: int
:return: The number of strings generable by gr using derivations at
most depth-many layers deep using prod as their first step
"""
result = 1
for item in prod.rhs():
if nltk.grammar.is_nonterminal(item):
result *= count_nonterminal_depth(item, depth - 1, table, gr)
return result
[docs]def choose_production(nt, depth, table, gr):
"""
Randomly chooses a production with a given left-hand side nt. The
probability of each production p is the proportion of strings
generable from nt with derivations of at most a given depth that
have derivations invoking p in the first step.
:type nt: nltk.grammar.Nonterminal
:param nt: The left-hand side of the production chosen
:type depth: int
:param depth: The maximum depth of derivations considered
:type table: dict
:param table: A table computed by make_table (see make_table and
count_production_depth)
:type gr: CFG
:param gr: The context-free grammar from which the productions are
drawn
:rtype: nltk.grammar.Production
:return: The chosen production
"""
prods = gr.productions(lhs=nt)
counts = []
total = 0.
for prod in prods:
count = count_production_depth(prod, depth, table, gr)
counts += [count]
total += count
counts = [i / total for i in counts]
k = select_from_dist(counts)
return prods[k]
if __name__ == "__main__":
######################################################
# some test grammars
dyck_grammar = CFG.fromstring("""
S -> S S
S -> '(' S ')' | '(' ')'
S -> '[' S ']' | '[' ']'
""")
unambig_agreement_grammar = CFG.fromstring("""
S -> NPsing "Auxsing"
S -> NPplur "Auxplur"
NP -> NPsing
NP -> NPplur
NPsing -> "Det" "Nsing" PP
NPplur -> "Det" "Nplur" PP
NPsing -> "Det" "Nsing" Relsing
NPplur -> "Det" "Nplur" Relplur
NPsing -> "Det" "Nsing"
NPplur -> "Det" "Nplur"
PP -> "Prep" NP
Relsing -> "Rel" "Auxsing" VP
Relsing -> Relobj
Relplur -> "Rel" "Auxplur" VP
Relplur -> Relobj
Relobj -> "Rel" NPsing "Auxsing" "Vtrans"
Relobj -> "Rel" NPplur "Auxplur" "Vtrans"
VP -> "Vintrans"
VP -> "Vtrans" NP
""")
exp_eval_grammar = CFG.fromstring("""
S -> Strue | Sfalse
Strue -> "T" "1"
Strue -> Strue Strue "and" "1"
Strue -> Strue Strue "or" "1" | Strue Sfalse "or" "1"
Strue -> Sfalse Strue "or" "1"
Sfalse -> "F" "0"
Sfalse -> Sfalse Sfalse "or" "0"
Sfalse -> Sfalse Sfalse "and" "0" | Sfalse Strue "and" "0"
Sfalse -> Strue Sfalse "and" "0"
""")
# to save typing
dgr = dyck_grammar
uagr = unambig_agreement_grammar
eegr = exp_eval_grammar
# comparisons of table calculations and reported sample sizes
# Note: the generate function from nltk uses a notion of
# depth that is 1 more than that used above!
# NB: the dyck_grammar is NOT unambiguous (S -> S S)
dgr_table = make_table(6, dgr)
print "dyck_grammar for 4 from count_nonterminal_depth"
print count_nonterminal_depth(dgr.start(), 4, dgr_table, dgr)
print "nltk generate: number of sentences for dyck grammar at depth = 5"
print len(list(generate(dgr, depth=5)))
print "The dyck_grammar is ambiguous!"
# unambig_agreement_grammar
# this agrees with the count for depth = 16 in generate
uagr_table = make_table(15, uagr)
print "unambig_agreement_grammar for 15 from count_nonterminal_depth"
print count_nonterminal_depth(uagr.start(), 15, uagr_table, uagr)
# exp_eval_grammar
# this agrees with the count for depth = 6 in generate
eegr_table = make_table(5, eegr)
print "exp_eval_grammar for 5 from count_nonterminal_depth"
print count_nonterminal_depth(eegr.start(), 5, eegr_table, eegr)
print "number of nltk depth = 7 sentences from dyck_grammar"
print count_nonterminal_depth(dgr.start(), 6, dgr_table, dgr)
print "a random sentence nltk depth = 7 from dyck_grammar"
print random_sentences(1, 6, dgr)
sentences100 = random_sentences(100, 6, dgr)
print "maximum length among 100 nltk depth = 7 sentences from dyck_grammar"
print max([len(x) for x in sentences100])