Source code for formalisms.depth_generate

"""
Generate a list of random sentences of a given derivation depth from a
context-free grammar. Code by Dana Angluin. The main function is
random_sentences.
"""
from __future__ import division

import random

import nltk.grammar
from nltk import CFG
from nltk.parse.generate import generate


[docs]def random_sentences(count, depth, gr):
    """
    Generates a number of random sentences using derivations of at most
    a given depth.

    :type count: int
    :param count: The number of sentences to generate

    :type depth: int
    :param depth: The maximum derivation depth of a generated sentence

    :type gr: CFG
    :param gr: A context-free grammar to generate from

    :type: list
    :return: The generated sentences
    """
    nonterminals = nonterminals_from_grammar(gr)
    gr_table = make_table(depth, gr)
    sentences = [random_from_form([gr.start()], depth, gr_table,
                                  nonterminals, gr) for _ in xrange(count)]
    return sentences


[docs]def make_table(depth, gr):
    """
    For each production p of a context-free grammar and each number k,
    this function computes the number of terminal strings whose
    derivations
        - invoke the production p as the first step and
        - have depth at most k.

    :type depth: int
    :param depth: The maximum possible value of k (see above)

    :type gr: CFG
    :param gr: A context-free grammar

    :rtype: dict
    :return: For each production p and number k, the return dict maps
        the tuple (p, k) to the number described above
    """
    productions = gr.productions()
    table = {}
    for prod in productions:
        table[(prod, 0)] = 0
        if is_terminal_production(prod, gr):
            table[(prod, 1)] = 1
        else:
            table[(prod, 1)] = 0
    for k in xrange(2, depth + 1):
        for prod in productions:
            table[(prod, k)] = count_production_depth(prod, k, table, gr)
    return table


""" Helper Functions """


[docs]def remove_duplicates(lst):
    """
    Removes duplicates from a given list.

    :type lst: list
    :param lst: A list

    :rtype: list
    :return: lst, but with duplicates removed
    """
    return list(set(lst))


[docs]def all_not_in(lst1, lst2):
    """
    Determines whether a list contains an element of another list.

    :type lst1: list
    :param lst1: A list

    :type lst2: list
    :param lst2: Another list

    :rtype: bool
    :return: True if no element of lst1 is in list2, False otherwise
    """
    return set(lst1).isdisjoint(lst2)


[docs]def select_from_dist(prob):
    """
    This function chooses a random number according to a given
    probability distribution.

    :type prob: list
    :param prob: A probability distribution represented as a list. For
        each number i, prob[i] is the probability that this function
        returns i. For example, if [.2, .3, .5] is passed to this
        parameter, then there is a 20% chance of returning 0, a 30%
        chance of returning 1, and a 50% chance of returning 2.

    :rtype: int
    :return: The number chosen
    """
    x = random.random()
    cumulative = 0.0
    for i in xrange(len(prob)):
        if x <= cumulative + prob[i]:
            return i
        else:
            cumulative += prob[i]
    return len(prob)


[docs]def all_lhs_from_grammar(gr):
    """
    Finds the left-hand sides of all productions in a context-free
    grammar. The return value may contain duplicates.

    :type gr: CFG
    :param gr: A context-free grammar

    :rtype: list
    :return: All the nonterminals appearing in the left-hand side of a
        production of gr
    """
    return [prod.lhs() for prod in gr.productions()]


[docs]def nonterminals_from_grammar(gr):
    """
    Finds the left-hand sides of all productions in a context-free
    grammar. The return value does not contain duplicates.

    :type gr: CFG
    :param gr: A context-free grammar

    :rtype: list
    :return: All the nonterminals appearing in the left-hand side of a
        production of gr
    """
    return remove_duplicates(all_lhs_from_grammar(gr))


[docs]def is_terminal_production(prod, gr):
    """
    Determins whether or not a rule contains nonterminals in its right-
    hand side.

    :type prod: nltk.grammar.Production
    :param prod: A production

    :type gr: CFG
    :param gr: A CFG

    :rtype: bool
    :return: True if there are no nonterminals in the right-hand side of
        prod, False otherwise
    """
    return all(nltk.grammar.is_terminal(n) for n in prod.rhs())


[docs]def count_nonterminal_depth(nonterminal, depth, table, gr):
    """
    A helper function for make_table. This function computes the number
    of terminal strings generable from a nonterminal using a derivation
    of at most a given depth. This function assumes that the number for
    the previous depth has already been computed.

    :type nonterminal: nltk.grammar.Nonterminal
    :param nonterminal: The nonterminal from which generable strings are
        considered

    :type depth: int
    :param depth: The maximum depth of derivations considered

    :type table: dict
    :param table: A table containing the results obtained from this
        function for the previous depth. The format of this table is the
        same as the return value of make_table

    :type gr: CFG
    :param gr: The grammar whose generable strings are being considered

    :rtype: int
    :return: The number of strings generable by gr from nonterminal
        using derivations at most depth-many layers deep
    """
    nt_prods = gr.productions(lhs=nonterminal)
    total = 0
    for prod in nt_prods:
        total += table[(prod, depth)]
    return total


[docs]def count_production_depth(prod, depth, table, gr):
    """
    A helper function for make_table. This function computes the number
    of terminal strings generable using a derivation of at most a given
    depth that invokes a given production as its first step. This
    function assumes that the number for the previous depth has already
    been computed.

    :type prod: nltk.grammar.Production
    :param prod: The production invoked during the first step of the
        derivations considered

    :type depth: int
    :param depth: The maximum depth of derivations considered

    :type table: dict
    :param table: A table containing the results obtained from this
        function for the previous depth. The format of this table is the
        same as the return value of make_table

    :type gr: CFG
    :param gr: The grammar whose generable strings are being considered

    :rtype: int
    :return: The number of strings generable by gr using derivations at
        most depth-many layers deep using prod as their first step
    """
    result = 1
    for item in prod.rhs():
        if nltk.grammar.is_nonterminal(item):
            result *= count_nonterminal_depth(item, depth - 1, table, gr)
    return result


[docs]def choose_production(nt, depth, table, gr):
    """
    Randomly chooses a production with a given left-hand side nt. The
    probability of each production p is the proportion of strings
    generable from nt with derivations of at most a given depth that
    have derivations invoking p in the first step.

    :type nt: nltk.grammar.Nonterminal
    :param nt: The left-hand side of the production chosen

    :type depth: int
    :param depth: The maximum depth of derivations considered

    :type table: dict
    :param table: A table computed by make_table (see make_table and
        count_production_depth)

    :type gr: CFG
    :param gr: The context-free grammar from which the productions are
        drawn

    :rtype: nltk.grammar.Production
    :return: The chosen production
    """
    prods = gr.productions(lhs=nt)
    counts = []
    total = 0.
    for prod in prods:
        count = count_production_depth(prod, depth, table, gr)
        counts += [count]
        total += count
    counts = [i / total for i in counts]
    k = select_from_dist(counts)
    return prods[k]


[docs]def random_from_form(form, depth, table, nonterminals, gr):
    """
    Generates a random terminal string generable from a list of
    terminals and nonterminals using a derivation of at most a given
    depth.

    :type form: list
    :param form: A list of terminals and nonterminals, from which the
        return value is derived

    :type depth: int
    :param depth: The maximum depth of derivations considered

    :type table: dict
    :param table: A table computed by make_table (see make_table and
        count_production_depth)

    :type nonterminals: list
    :param nonterminals: Only nonterminals appearing in this list will
        be expanded

    :type gr: CFG
    :param gr: A context-free grammar

    :rtype: list
    :return: The generated terminal string, in sentence format
    """
    if depth <= 0:
        return form
    if all_not_in(form, nonterminals):
        return form
    else:
        result_lst = []
        for item in form:
            if item in nonterminals:
                prod = choose_production(item, depth, table, gr)
                result_lst += prod.rhs()
            else:
                result_lst += [item]
    return random_from_form(result_lst, depth - 1, table, nonterminals, gr)


if __name__ == "__main__":
    ######################################################
    # some test grammars

    dyck_grammar = CFG.fromstring("""
        S -> S S
        S -> '(' S ')' | '(' ')'
        S -> '[' S ']' | '[' ']'
    """)

    unambig_agreement_grammar = CFG.fromstring("""
        S -> NPsing "Auxsing"
        S -> NPplur "Auxplur"
        NP -> NPsing
        NP -> NPplur
        NPsing -> "Det" "Nsing" PP
        NPplur -> "Det" "Nplur" PP
        NPsing -> "Det" "Nsing" Relsing
        NPplur -> "Det" "Nplur" Relplur
        NPsing -> "Det" "Nsing"
        NPplur -> "Det" "Nplur"
        PP -> "Prep" NP
        Relsing -> "Rel" "Auxsing" VP
        Relsing -> Relobj
        Relplur -> "Rel" "Auxplur" VP
        Relplur -> Relobj
        Relobj -> "Rel" NPsing "Auxsing" "Vtrans"
        Relobj -> "Rel" NPplur "Auxplur" "Vtrans"
        VP -> "Vintrans"
        VP -> "Vtrans" NP
    """)

    exp_eval_grammar = CFG.fromstring("""
        S -> Strue | Sfalse
        Strue -> "T" "1" 
        Strue -> Strue Strue "and" "1"
        Strue -> Strue Strue "or" "1" | Strue Sfalse "or" "1"
        Strue -> Sfalse Strue "or" "1"
        Sfalse -> "F" "0"
        Sfalse -> Sfalse Sfalse "or" "0" 
        Sfalse -> Sfalse Sfalse "and" "0" | Sfalse Strue "and" "0"
        Sfalse -> Strue Sfalse "and" "0"
    """)

    # to save typing
    dgr = dyck_grammar
    uagr = unambig_agreement_grammar
    eegr = exp_eval_grammar

    # comparisons of table calculations and reported sample sizes

    # Note: the generate function from nltk uses a notion of
    # depth that is 1 more than that used above!

    # NB: the dyck_grammar is NOT unambiguous (S -> S S)
    dgr_table = make_table(6, dgr)
    print "dyck_grammar for 4 from count_nonterminal_depth"
    print count_nonterminal_depth(dgr.start(), 4, dgr_table, dgr)
    print "nltk generate: number of sentences for dyck grammar at depth = 5"
    print len(list(generate(dgr, depth=5)))
    print "The dyck_grammar is ambiguous!"

    # unambig_agreement_grammar
    # this agrees with the count for depth = 16 in generate
    uagr_table = make_table(15, uagr)
    print "unambig_agreement_grammar for 15 from count_nonterminal_depth"
    print count_nonterminal_depth(uagr.start(), 15, uagr_table, uagr)

    # exp_eval_grammar
    # this agrees with the count for depth = 6 in generate
    eegr_table = make_table(5, eegr)
    print "exp_eval_grammar for 5 from count_nonterminal_depth"
    print count_nonterminal_depth(eegr.start(), 5, eegr_table, eegr)

    print "number of nltk depth = 7 sentences from dyck_grammar"
    print count_nonterminal_depth(dgr.start(), 6, dgr_table, dgr)

    print "a random sentence nltk depth = 7 from dyck_grammar"
    print random_sentences(1, 6, dgr)

    sentences100 = random_sentences(100, 6, dgr)
    print "maximum length among 100 nltk depth = 7 sentences from dyck_grammar"
    print max([len(x) for x in sentences100])