Wednesday, May 27, 2015

nlp17. Frequency Distribution of Trigrams in Python NLTK

The counts of different n-grams can be found by using nltk.probability.FreqDist. An instance of this class behaves like a Counter: it maps each n-gram to the number of times it occurs.


We use the same text as before, but another random sentence has been added.

# nlp17.py
from __future__ import print_function
from nltk.probability import FreqDist
from nltk.collocations import ngrams
from nltk.tokenize import PunktWordTokenizer
from nltk.corpus import stopwords
# Sample text. The last sentence repeats "NLTK leading platform", so that
# trigram is counted twice while every other trigram is counted once.
text = """
NLTK is a leading platform for building Python
programs to work with human language data. It's
considered the best by 2 out of 30 computational
linguists. NLTK leading platform of world.
"""

# Build the stopword set once: every token is tested against it below, and
# set membership avoids a linear scan of the stopword list per token.
S = set(stopwords.words('english'))

# NOTE(review): PunktWordTokenizer appears to be unavailable in newer NLTK
# releases -- confirm against the installed version before running.
tok = PunktWordTokenizer().tokenize

# Single pass: drop stopwords (exact, case-sensitive match) and any token
# that is two characters or shorter.
words = [w for w in tok(text) if w not in S and len(w) > 2]

# Materialise the trigrams as a list because they are traversed twice:
# once to print them and once to count them.
A = list(ngrams(words, 3))

# Print every trigram in order of appearance, then a blank separator line.
for a in A:
    print(a)
print()

# Count how many times each distinct trigram occurs and print the tallies.
fdist = FreqDist(A)
for k, v in fdist.items():
    print(k, v)
    
#    ('NLTK', 'leading', 'platform')
#    ('leading', 'platform', 'building')
#    ('platform', 'building', 'Python')
#    ('building', 'Python', 'programs')
#    ('Python', 'programs', 'work')
#    ('programs', 'work', 'human')
#    ('work', 'human', 'language')
#    ('human', 'language', 'data.')
#    ('language', 'data.', 'considered')
#    ('data.', 'considered', 'best')
#    ('considered', 'best', 'computational')
#    ('best', 'computational', 'linguists.')
#    ('computational', 'linguists.', 'NLTK')
#    ('linguists.', 'NLTK', 'leading')
#    ('NLTK', 'leading', 'platform')
#    ('leading', 'platform', 'world.')
#    
#    ('platform', 'building', 'Python') 1
#    ('computational', 'linguists.', 'NLTK') 1
#    ('considered', 'best', 'computational') 1
#    ('linguists.', 'NLTK', 'leading') 1
#    ('best', 'computational', 'linguists.') 1
#    ('leading', 'platform', 'building') 1
#    ('building', 'Python', 'programs') 1
#    ('data.', 'considered', 'best') 1
#    ('Python', 'programs', 'work') 1
#    ('NLTK', 'leading', 'platform') 2
#    ('leading', 'platform', 'world.') 1
#    ('language', 'data.', 'considered') 1
#    ('programs', 'work', 'human') 1
#    ('work', 'human', 'language') 1
#    ('human', 'language', 'data.') 1

No comments:

Post a Comment