In [1]:
import sys
import re
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
import hunspell
import pyphen
from metaphone import doublemetaphone
from pattern.en import sentiment
from pattern.web import Twitter, plaintext

def get_vocabulary_from_theme(theme):
    """Collect tweets that use *theme* in a metaphor-like phrase.

    For each pattern ("is like", "feels like", "is more important than") a
    quoted phrase made of the theme followed by the pattern is searched on
    Twitter through Pattern's Twitter engine (cache disabled).

    theme -- the word or phrase the metaphors should be about.

    Returns a set of cleaned tweet texts: markup removed by plaintext(),
    non-ASCII characters dropped, the standalone "RT" marker and @mentions
    stripped.
    """
    twitter = Twitter(language='en')
    metaphor_patterns = ["is like", "feels like", "is more important than"]
    vocabulary = set()
    for metaphor_source in metaphor_patterns:
        # The separating space matters: without it the quoted phrase becomes
        # e.g. "Valentinesis like" and cannot match what people write.
        query = '"' + theme + ' ' + metaphor_source + '"'
        for tweet in twitter.search(query, cached=False):
            cleanedtweet = plaintext(tweet.text).encode('ascii', 'ignore')
            # Word boundaries keep "RT" inside ordinary words (e.g. "SMART")
            # from being removed along with the retweet marker.
            cleanedtweet = re.sub(r"\bRT\b", "", cleanedtweet)
            cleanedtweet = re.sub(r"@\w+", "", cleanedtweet)
            vocabulary.add(cleanedtweet)
    return vocabulary

#get related vocabulary from theme : Pattern
def get_disambiguation(phrase, word):
    """Return the WordNet definition of *word* as it is used in *phrase*.

    Word-sense disambiguation is done with NLTK's Lesk implementation
    (an alternative library is pywsd: https://github.com/alvations/pywsd).

    phrase -- the context: either a sentence string or a list of tokens.
    word   -- the ambiguous word to resolve.

    Returns the definition of the selected synset, or None when Lesk does
    not find any synset for the word.
    """
    # lesk() treats the context as an iterable of words; a raw string would
    # be iterated character by character, so tokenize it first.
    if isinstance(phrase, (str, type(u''))):
        phrase = word_tokenize(phrase)
    disambiguated = lesk(context_sentence=phrase, ambiguous_word=word)
    if disambiguated is None:
        return None
    return disambiguated.definition()

def find_synonyms(word):
    """Return a set of candidate synonyms of *word* taken from WordNet.

    The candidates are the names of every synset found for the word and of
    every synset listed by similar_tos() for those synsets.
    """
    def readable(synset):
        # Keep the part of the synset name before the first '.', and turn
        # underscores into spaces.
        return synset.name().split(".")[0].replace('_', ' ')

    found = set()
    for synset in wn.synsets(word):
        found.add(readable(synset))
        found.update(readable(similar) for similar in synset.similar_tos())
    return found
# http://stackoverflow.com/questions/5534926/to-find-synonyms-defintions-and-example-sentences-using-wordnet

def count_syllables(phrase):
    """Estimate the number of syllables in *phrase* using Pyphen.

    The phrase is hyphenated with the en_US dictionary, then every run of
    word characters or apostrophes in the hyphenated text counts as one
    syllable.
    """
    hyphenator = pyphen.Pyphen(lang='en_US')
    hyphenated = hyphenator.inserted(phrase)
    chunks = re.findall(r"[\w']+", hyphenated)
    return len(chunks)
#from nltk_contrib.readability.textanalyzer import syllables_en
#print syllables_en.count("potatoes ")
# http://image.slidesharecdn.com/nltk-110105180423-phpapp02/95/nltk-natural-language-processing-in-python-22-728.jpg?cb=1309726267
# using PyPhen instead

def words_rhymes(w1,w2):
    """Tell whether *w1* and *w2* rhyme, judged by their Metaphone codes.

    Two inputs are considered rhyming when the primary Double Metaphone
    codes end with the same symbol.

    Returns False when either input has an empty primary code (presumably
    the case for an empty string or text without letters), instead of
    failing with IndexError on the [-1] lookup.
    """
    w1_primary = doublemetaphone(w1)[0]
    w2_primary = doublemetaphone(w2)[0]
    if not w1_primary or not w2_primary:
        return False
    return w1_primary[-1] == w2_primary[-1]
    # the secondary code is not used... could use simple metaphone instead?

def syllables_matching_words_from_list(list_of_words,word):
    """Return the entries of *list_of_words* with as many syllables as *word*.

    list_of_words -- iterable of candidate words.
    word          -- the reference word.

    Returns a list, in iteration order, of the candidates whose
    count_syllables() value equals the one of *word*.
    """
    # The reference count does not change between candidates: compute it once
    # rather than once per candidate.
    target = count_syllables(word)
    return [candidate for candidate in list_of_words
            if count_syllables(candidate) == target]

def rhyming_words_from_list(list_of_words,word):
    """Return the entries of *list_of_words* that rhyme with *word*.

    Rhyming is decided by words_rhymes(candidate, word); the result is a
    list in iteration order.
    """
    return [candidate for candidate in list_of_words
            if words_rhymes(candidate, word)]

def spellcheck(text,
               dic_path='/usr/share/hunspell/en_US.dic',
               aff_path='/usr/share/hunspell/en_US.aff'):
    """Spell check *text* with Hunspell.

    text     -- the text to check; it is split into runs of word characters
                and apostrophes.
    dic_path -- path of the Hunspell .dic file (defaults to the en_US one
                under /usr/share/hunspell).
    aff_path -- path of the matching .aff file.

    Returns a dict mapping each word rejected by Hunspell to the suggestions
    Hunspell offers for it.
    """
    potential_mistakes = {}
    hobj = hunspell.HunSpell(dic_path, aff_path)
    for word in re.findall(r"[\w']+", text):
        if word in potential_mistakes:
            # Already reported: do not ask for the same suggestions again.
            continue
        if not hobj.spell(word):
            potential_mistakes[word] = hobj.suggest(word)
    return potential_mistakes

def estimate_sentiment(text):
    """Run Pattern's sentiment() on *text* and return its result unchanged.

    Nothing is added so far, this is just a personal wrapper. For the
    meaning of the returned values see
    http://www.clips.ua.ac.be/pages/pattern-en#sentiment
    """
    score = sentiment(text)
    return score

def replace_tag(text,tag,replacement):
    """Replace the first token of *text* carrying the POS tag *tag*.

    Example: replace_tag("I want to eat an apple.", "NN", "orange")

    text        -- the sentence to modify.
    tag         -- the part-of-speech tag to look for (as produced by
                   nltk.pos_tag).
    replacement -- the text to put in place of the token.

    The tokens are scanned in order; for the first one whose tag equals
    *tag*, every occurrence of that token's text in *text* is replaced and
    the result returned. Returns *text* unchanged when no token has the tag.
    """
    for token, token_tag in pos_tag(word_tokenize(text)):
        if token_tag == tag:
            # Literal replacement: the token is not a regular expression
            # (tokens such as "." or "?" would otherwise match anything or
            # fail to compile) and the replacement is not a regex template.
            return text.replace(token, replacement)
    return text

def respect_structure(text, structure):
    """Check that a poem follows a structure.

    text      -- a list of verses (strings), not a single string.
    structure -- a list with one entry per verse; entry[0] is the required
                 syllable count, or None when the count does not matter;
                 entry[1] is the index of the verse this one must rhyme
                 with, or None when there is no rhyme constraint.

    Returns True when every verse satisfies its constraints. Otherwise
    prints which check failed (and the verse index) and returns False.
    """
    if len(text) != len(structure):
        print('failed number of lines')
        return False
    for i, line in enumerate(text):
        expected_syllables = structure[i][0]
        # None means "any number of syllables": skip the count entirely.
        if expected_syllables is not None and count_syllables(line) != expected_syllables:
            print('failed syllables')
            print(i)
            return False
        rhyme_index = structure[i][1]
        if rhyme_index is not None and not words_rhymes(line, text[rhyme_index]):
            # when two verses reference each other the same pair is tested
            # again at the corresponding later line
            print('failed rhyming')
            print(i)
            return False
    return True
    # poor return value, unable to tell what failed

#should become a grammar respecting function
def respect_grammar(text, grammar):
    """Tell whether *text* has exactly the POS tag sequence *grammar*.

    The text is tokenized and tagged with NLTK; the list of tags, in token
    order, is compared for equality with *grammar* (a list of tags).
    """
    tags = [tagged[-1] for tagged in pos_tag(word_tokenize(text))]
    return tags == grammar
    # http://image.slidesharecdn.com/nltk-110105180423-phpapp02/95/nltk-natural-language-processing-in-python-22-728.jpg?cb=1309726267

# Demo run when the file is executed as a script: prints the configuration,
# echoes the command-line words, then exercises respect_structure() and
# respect_grammar() on small hand-written samples. Several sections are
# placeholders that only print an initial value.
if __name__ == '__main__' :
    # Words given on the command line (everything after the script name).
    # NOTE: when run inside an IPython/Jupyter kernel, sys.argv holds the
    # kernel's own arguments instead.
    testwords = []
    for arg in sys.argv[1:]:
        testwords.append(arg)

    themes = ['Valentines','Relationship']
    nicknames = ['gingersnap','Beaubear']
    names = ['Alison','Beau']
    themes += ['Thirteenth']
    
    # TODO: include nicknames as potential synonyms for each name
    print 'generating for :'
    print 'themes' + str(themes)
    print 'names' + str(names) + ' aka ' + str(nicknames)
    print '-------------------------------------------------'
    
    print "test words from the command line:"
    if len(testwords) > 0:
        print testwords
#        print words_rhymes(testwords[0],testwords[1])
#        print words_rhymes(testwords[2],testwords[1])
#        print find_synonyms(testwords[0])
        for w in testwords:
            print w
#            print 'finding rhyming synonyms'
#            synonyms = find_synonyms(w)
#            print rhyming_words_from_list(synonyms,w)
#            print 'finding syllables matching synonyms'
#            print syllables_matching_words_from_list(synonyms,w)
#            print get_vocabulary_from_theme(w)

    # Poem structures: one (syllable count, index of the rhyming line) tuple
    # per line, as consumed by respect_structure().
    structures = {}
    structures['haiku'] = [(5,None),(7,None),(5,None)]
    structures['rhymetest'] = [(2,1),(2,0)]
                                # redundant test: line 0 -> 1 and line 1 -> 0
                                # check the same pair twice
    # structures have :
    #   metres or syllables
    #   stanza or lines or verses
    #   rhyme_form (e.g. current line rhymes with line 3)
    #               Python indexing, line 1 is in fact index 0
    # WARNING : better to over-define multiple versions of one structure than to handle strange open cases!
    #           no free poetry but N different lines with N different rhymes
    #           haiku3 haiku5 haiku7 for the different number of verses
    #           etc
    myhaikutest = ['a b c d e','a b c d e f g','a b c d e']
    print 'Testing structure respect'
    print 'structures:'+ str(structures)
    print str(myhaikutest) + ' with structure ' + str(structures['haiku'])
    print respect_structure(myhaikutest,structures['haiku'])
    myrhymetest = ["patato","potato"]
    print str(myrhymetest) + ' with structure ' + str(structures['rhymetest'])
    print respect_structure(myrhymetest,structures['rhymetest'])
    
    # Placeholder: no vocabulary is fetched yet, the empty list is printed.
    vocabulary_from_theme = []
    print 'vocabulary_from_theme' + str(vocabulary_from_theme)

    # Grammar check against the expected POS tag sequence of the sentence.
    sentence1 = 'I want to eat fruits.'
    test = respect_grammar(sentence1,['PRP', 'VBP', 'TO', 'VB', 'NNS', '.'])
    print 'The grammar has been respected' + str(test)
    
    # Placeholder: no rhyme search is done yet, the empty list is printed.
    found_rhyming_words = [] 
    print 'found_rhyming_words' + str(found_rhyming_words)
    
    #check coherence : word2vec with pretrained data https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
    #consider instead cheaper generic NLP : NLTK / spaCy
    # Placeholder: the paragraph is not analysed yet and the coherence is a
    # constant 0.
    generated_paragraph = 'I eat tapas in Spain and camembert in France. They were both delicious but especially the cheese.'
    estimated_coherence = 0
    print 'estimated_coherence:' + str(estimated_coherence)
    #import gensim
    #import os
    #model = gensim.models.word2vec.Word2Vec.load_word2vec_format(os.path.join(os.path.dirname(__file__), 'GoogleNews-vectors-negative300.bin'), binary=True)
    #model.most_similar('dog')
    # consider http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents
    # if it uses less memory
    
    #https://github.com/lekhakpadmanabh/Summarizer
    #summarize a text that is too long
    
    #Named Entity Recognition (NEs) in http://www.nltk.org/book/ch07.html
    
    # End of the demo.
    exit()
    

generating for :
themes['Valentines', 'Relationship', 'Thirteenth']
names['Alison', 'Beau'] aka ['gingersnap', 'Beaubear']
-------------------------------------------------
test words from the command line:
['-f', '/home/fabien/.ipython/profile_nbserver/security/kernel-970602fc-fdb6-46ae-800b-09895affb1cb.json', '--profile-dir', '/home/fabien/.ipython/profile_nbserver']
-f
/home/fabien/.ipython/profile_nbserver/security/kernel-970602fc-fdb6-46ae-800b-09895affb1cb.json
--profile-dir
/home/fabien/.ipython/profile_nbserver
Testing structure respect
structures:{'rhymetest': [(2, 1), (2, 0)], 'haiku': [(5, None), (7, None), (5, None)]}
['a b c d e', 'a b c d e f g', 'a b c d e'] with structure [(5, None), (7, None), (5, None)]
True
['patato', 'potato'] with structure [(2, 1), (2, 0)]
True
vocabulary_from_theme[]
The grammar has been respectedTrue
found_rhyming_words[]
estimated_coherence:0


In [2]:
# Scratch check: a word compared with itself (recorded output: True).
print words_rhymes('test','test')

True


In [None]:
# Scratch check: 'test' against 'testing' (recorded output: False).
print words_rhymes('test','testing')

False


In [1]:
# The recorded output is a NameError: count_syllables was not defined in the
# session this cell ran in (prompt In [1] -- presumably a restarted kernel
# where the definitions cell had not been executed yet).
print count_syllables('potato')

NameError: name 'count_syllables' is not defined

In [None]:
# Ask pyphen about its fallback for 'en_US' (no output was recorded).
# NOTE(review): this cell comes before the `import pyphen` cell below; in a
# fresh session the import has to run first.
pyphen.language_fallback('en_US')

In [None]:
import pyphen


In [None]:
# Same fallback query as in the earlier cell, repeated after the
# `import pyphen` cell (no output was recorded).
pyphen.language_fallback('en_US')

In [None]:
# Build the en_US hyphenation dictionary, the same object count_syllables()
# creates on each call.
dic = pyphen.Pyphen(lang='en_US')