Sunday, May 31, 2015

nlp30. Reading and Drawing a Chunked Corpus in Python NLTK

The chunked text is read using ChunkedCorpusReader.


Only words inside chunks carry tags; all other words get None.


Three tree drawings are created. When one is closed by clicking its X, the next one pops up, and so on. The draw method opens a GUI window, so a working Tkinter installation is required.

# nlp30.py
from __future__ import print_function
from nltk.data import path
from nltk.corpus.reader import ChunkedCorpusReader
path = path[0] + '/MyTest3Corpus'
reader = ChunkedCorpusReader(path, '.*')
words1 = reader.tagged_words()
for word in words1:
    print(word)
lines1 = reader.chunked_sents()
for l1 in lines1:
    l1.draw()
    
#    (u'Bush', u'R')
#    (u'Clinton', u'D')
#    (u'odds', None)
#    (u'are', None)
#    (u'1', None)
#    (u'in', None)
#    (u'2', None)
#    (u'Rubio', u'R')
#    (u'Clinton', u'D')
#    (u'odds', None)
#    (u'are', None)
#    (u'1', None)
#    (u'in', None)
#    (u'3', None)
#    (u'The', None)
#    (u'odds', None)
#    (u'of', None)
#    (u'Paul', u'R')
#    (u'Sanders', u'D')
#    (u'are', None)
#    (u'small', None)

Output: the three chunk trees, each drawn in its own window (screenshots not included here).
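
If a GUI is not available, the chunk trees can also be inspected as plain text, since printing a chunked sentence gives its bracketed form. A minimal sketch, reusing the same reader setup as above:

# sketch: print the chunk trees instead of drawing them
from __future__ import print_function
from nltk.data import path
from nltk.corpus.reader import ChunkedCorpusReader
reader = ChunkedCorpusReader(path[0] + '/MyTest3Corpus', '.*')
for sent in reader.chunked_sents():
    print(sent)  # each sentence prints as a bracketed tree string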

nlp29. Creating chunked corpus in Python NLTK

The tagged chunks have to be placed between [ and ].


Three lines are written, with one chunk in each line, each giving the odds of a possible outcome.

# nlp29.py
from __future__ import print_function
import os
from nltk.data import path
name = 'MyTest3Corpus'
name1 = 'Test3'
words1 = []
words1 += ['[Bush/R Clinton/D] odds are 1 in 2\n']
words1 += ['[Rubio/R Clinton/D] odds are 1 in 3\n']
words1 += ['The odds of [Paul/R Sanders/D] are small\n'] 
os.chdir(path[0])
os.mkdir(name)
os.chdir(name)
fout1 = open(name1,'w')
fout1.writelines(words1)
fout1.close()
print('word1 = \n',words1)

# word1 = 
#  ['[Bush/R Clinton/D] odds are 1 in 2\n',
#   '[Rubio/R Clinton/D] odds are 1 in 3\n',
#   'The odds of [Paul/R Sanders/D] are small\n']

nlp28. Reading a tagged corpus in Python NLTK

We use TaggedCorpusReader to read the corpus created in the last program.


The methods words() and tagged_words() are used to get the file contents as plain strings or as (word, tag) tuples.

# nlp28.py
from __future__ import print_function
from nltk.data import path
from nltk.corpus.reader import TaggedCorpusReader
path = path[0] + '/MyTest2Corpus'
reader = TaggedCorpusReader(path, '.*')
word1 = reader.words()
print("words1 =")
for i in range(len(word1)//2):
    print(word1[2*i],'\t',word1[2*i+1])
print()
tag_words1 = reader.tagged_words()
print("tag_words1 =")
for i in range(len(tag_words1)//2):
    print(tag_words1[2*i],'\t',tag_words1[2*i+1])

# words1 =
# Bush     Clinton
# Rubio    Clinton
# Paul     Sanders
#
# tag_words1 =
# (u'Bush', u'R')          (u'Clinton', u'D')
# (u'Rubio', u'R')         (u'Clinton', u'D')
# (u'Paul', u'R')          (u'Sanders', u'D')
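
TaggedCorpusReader also groups tokens by sentence (one line per sentence in this corpus). A small sketch, reusing the reader object from nlp28.py:

# sketch: sentence-level access with the same reader
for sent in reader.tagged_sents():
    print(sent)
# expected: one list of (word, tag) tuples per line, e.g.
# [(u'Bush', u'R'), (u'Clinton', u'D')]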

nlp27. Creating tagged corpus in Python NLTK

We can tag each word in the corpus, putting the tag after the /.


Here a 6-word corpus is created with 6 tags.

# nlp27.py
from __future__ import print_function
import os
from nltk.data import path
name = 'MyTest2Corpus'
name1 = 'Test2'
word1 = ['Bush/R Clinton/D',
         'Rubio/R Clinton/D',
         'Paul/R Sanders/D']
os.chdir(path[0])
os.mkdir(name)
os.chdir(name)
word1 = [w+'\n' for w in word1]
fout1 = open(name1,'w')
fout1.writelines(word1)
fout1.close()
print('word1 = \n',word1)

# word1 = 
#  ['Bush/R Clinton/D\n', 'Rubio/R Clinton/D\n',
#   'Paul/R Sanders/D\n']

nlp26. Reading a corpus using WordListCorpusReader in Python NLTK

We use WordListCorpusReader to read the corpus that was created in the last program, importing the reader under the alias WLCR.


We have to pass the list of file names as the second parameter to WLCR.


LazyCorpusLoader, not WordListCorpusReader, is the main loader NLTK uses for its built-in corpora; we will go over it later.

# nlp26.py
from __future__ import print_function
from nltk.data import path
from nltk.corpus.reader import WordListCorpusReader as WLCR
path = path[0] + '/MyTestCorpus'
fileids = ['Test1','Test2']
reader = WLCR(path, fileids)
word1 = reader.words(fileids[0])
print("words1 =\n",word1)
word2 = reader.words(fileids[1])
print("words2 =\n",word2)

# words1 =
#  [u'One', u'Two', u'Five']
# words2 =
#  [u'Three', u'Four', u'Seven']
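
If words() is called with no fileid, the reader returns the words from all files in the corpus, concatenated in the order of the fileids list. A quick sketch using the reader object above:

# sketch: all words from every file in the corpus
word_all = reader.words()
print("all words =\n", word_all)
# expected: [u'One', u'Two', u'Five', u'Three', u'Four', u'Seven']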

nlp25. Creating a Corpus in Python NLTK

NLTK includes some corpora. However, we can create our own.


We use Python to create two files, 'Test1' and 'Test2', in the folder MyTestCorpus. MyTestCorpus is a subfolder that the program creates inside the nltk_data folder.


This could also have been done manually.

# nlp25.py
from __future__ import print_function
import os
from nltk.data import path
name = 'MyTestCorpus'
name1 = 'Test1'
word1 = ['One','Two','Five']
name2 = 'Test2'
word2 = ['Three', 'Four', 'Seven']
os.chdir(path[0])
os.mkdir(name)
os.chdir(name)
word1 = [w+'\n' for w in word1]
fout1 = open(name1,'w')
fout1.writelines(word1)
fout1.close()
word2 = [w+'\n' for w in word2]
fout2 = open(name2,'w')
fout2.writelines(word2)
fout2.close()
print('word1 = \n',word1)
print('word2 = \n',word2)

# word1 = 
#  ['One\n', 'Two\n', 'Five\n']
# word2 = 
#  ['Three\n', 'Four\n', 'Seven\n']

Saturday, May 30, 2015

nlp24. Wu-Palmer similarity in Python NLTK

Wu-Palmer similarity scores how closely two senses are related, based on where they sit in the hypernym tree: roughly 2*depth(LCS) / (depth(A) + depth(B)), where LCS is their lowest common subsumer. Scores range from 0 to 1, and higher means more similar.


We compute similarities between game, match, and stick, noting that match can mean something close to either game or stick depending on which sense we use.

# nlp24.py
from __future__ import print_function
from nltk.corpus import wordnet
a = 'game.n.01'
b1 = 'match.n.01'
b2 = 'match.n.02'
c = 'stick.n.01'
A = wordnet.synset(a)
B1 = wordnet.synset(b1)
B2 = wordnet.synset(b2)
C = wordnet.synset(c)
print(a,A.definition())
print(b1,B1.definition())
print(b2,B2.definition())
print(c,C.definition())
print(a,b1,A.wup_similarity(B1))
print(a,b2,A.wup_similarity(B2))
print(a,c,A.wup_similarity(C))
print(b1,c,B1.wup_similarity(C))
print(b2,c,B2.wup_similarity(C))

# game.n.01 a contest with rules to determine a winner
# match.n.01 lighter consisting of a thin piece of wood or cardboard
# tipped with combustible chemical; ignites with friction
# match.n.02 a formal contest in which two or more persons or teams
# compete
# stick.n.01 an implement consisting of a length of wood
# game.n.01 match.n.01 0.125
# game.n.01 match.n.02 0.571428571429
# game.n.01 stick.n.01 0.133333333333
# match.n.01 stick.n.01 0.705882352941
# match.n.02 stick.n.01 0.133333333333
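
The scores reflect where the senses sit in the hypernym tree. A quick way to see why match.n.02 lands close to game while match.n.01 does not is to look at the lowest common hypernym of each pair; a sketch using the synsets defined above:

# sketch: inspect the lowest common hypernym of two senses
print(A.lowest_common_hypernyms(B2))  # game vs. the contest sense of match
print(A.lowest_common_hypernyms(B1))  # game vs. the wooden-match sense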

nlp23. WordNet Lemmatizer in Python NLTK

The nltk.stem.wordnet.WordNetLemmatizer object can be used to lemmatize a word, that is, to reduce it to its base dictionary form.


Note that if the part of speech is not indicated, the word is treated as a noun. We make that default explicit here, so every item becomes a (word, pos) tuple and we can use * to unpack it into separate arguments; passing the tuple itself as a single argument would be an error.

# nlp23.py
from __future__ import print_function
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
words = [('order','v'),'order',('orders','v')]
for word in words:
    if type(word) == str:
        word = (word,'n')
    lemma = lem.lemmatize(*word)
    print('For:',word, end='\t')
    print('Lemma =',lemma)

# For: ('order', 'v')     Lemma = order
# For: ('order', 'n')     Lemma = order
# For: ('orders', 'v')    Lemma = order
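
The part of speech can also be passed using the constants defined in nltk.corpus.wordnet rather than bare strings; a small sketch:

# sketch: part-of-speech constants from the wordnet module
from nltk.corpus import wordnet
print(lem.lemmatize('orders', pos=wordnet.VERB))  # order
print(lem.lemmatize('orders', pos=wordnet.NOUN))  # order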

Thursday, May 28, 2015

nlp22. Named Entities in Python NLTK

Related to chunking, we can use nltk.chunk.ne_chunk to find named entities such as PERSON, ORGANIZATION, etc. in tagged, tokenized text. A passage from Wikipedia's article on NLP is used.


It should be noted that we do not have to write nltk.chunk.ne_chunk; we can write it as nltk.ne_chunk. However, I believe the longer form better shows the structure of NLTK, and the longer name is used only once, in the import statement: from nltk.chunk import ne_chunk rather than the shorthand from nltk import ne_chunk.


Also note that only the results which are not tuples are printed, and these correspond to named entities. (0, 0) and (0,) are tuples, but (0) is just the integer 0.

# nlp22.py
from __future__ import print_function
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
text = """The history of NLP generally starts in the
1950s, although work can be found from earlier periods.
In 1950, Alan Turing published an article titled
"Computing Machinery and Intelligence" which proposed
what is now called the Turing test as a criterion of
intelligence.
"""
tag_text = [tag for tag in pos_tag(word_tokenize(text))]
result = ne_chunk(tag_text)

for r in result:
    if type(r)!=type((0,)):
        print(r)

#    (ORGANIZATION NLP/NNP)
#    (PERSON Alan/NNP Turing/NNP)
#    (ORGANIZATION Intelligence/NNP)
#    (GPE Turing/NNP)
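
An alternative to comparing tuple types is to keep only the Tree subtrees, which is a bit more explicit. A sketch, assuming NLTK 3, where subtrees expose label() and leaves():

# sketch: filter the ne_chunk result by Tree instances
from nltk.tree import Tree
for r in result:
    if isinstance(r, Tree):
        print(r.label(), r.leaves())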

nlp21. Chinking in Python NLTK

In the last program, we got three chunks: (1) DT/JJ/NN (2) DT/JJ/NN (3) DT/NN.


With chinking, we can remove results from the chunking result set. A trivial example is to remove all entries with the signature DT/JJ/NN, so we only get the last chunk.


To do this, we add a chinking regular expression, enclosed in } { (the reverse of the chunk delimiters), on the second line of the chunk grammar; it describes the pattern we want to remove. Note that we have to use opening and closing triple quotes for the multi-line string.

# nlp21.py
from __future__ import print_function
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser
text = "The big dog barked at the little cat. The cat ran away."
tag_text = [tag for tag in pos_tag(word_tokenize(text))]

chunk = """CHNK: {<DT>?<JJ>*<NN>}
 }<DT><JJ><NN>{
"""

cp = RegexpParser(chunk)
result = cp.parse(tag_text)
print(result)
result.draw()

#(S
#  The/DT
#  big/JJ
#  dog/NN
#  barked/VBD
#  at/IN
#  the/DT
#  little/JJ
#  cat/NN
#  ./.
#  (CHNK The/DT cat/NN)
#  ran/VBD
#  away/RB
#  ./.)

Output: the parse tree drawing (screenshot not included here).
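
The CHNK subtrees can also be pulled out programmatically instead of reading the full parse. A sketch, assuming NLTK 3, where Tree provides subtrees() and label():

# sketch: iterate over only the CHNK subtrees of the result
for st in result.subtrees(filter=lambda t: t.label() == 'CHNK'):
    print(st)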

nlp20. Chunking in Python NLTK

We can break a text into chunks. This is the chunk structure we use: determiner (0 or 1), adjective (0 or more), noun.


The part of speech structure is sent to nltk.chunk.RegexpParser. We use the parse method of this class on our tagged text.


We get 3 chunks. Note that the only required element of a chunk in this example is the noun.

# nlp20.py
from __future__ import print_function
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser
text = "The big dog barked at the little cat. The cat ran away."
tag_text = [tag for tag in pos_tag(word_tokenize(text))]

chunk = "CHNK: {<DT>?<JJ>*<NN>}"
cp = RegexpParser(chunk)
result = cp.parse(tag_text)
print(result)
result.draw()

#(S
#  (CHNK The/DT big/JJ dog/NN)
#  barked/VBD
#  at/IN
#  (CHNK the/DT little/JJ cat/NN)
#  ./.
#  (CHNK The/DT cat/NN)
#  ran/VBD
#  away/RB
#  ./.)

Output: the chunk tree drawing (screenshot not included here).

nlp19. Part of Speech tagging in Python NLTK

By using nltk.tag.pos_tag, you can tag each word token in a text with its part of speech. Each token becomes a 2-tuple: the first element is the token and the second is the part-of-speech tag.


We also print the help for each tag using nltk.help. Once you are familiar with the tag abbreviations, you will rarely need this info.

# nlp19.py
from __future__ import print_function
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.help import upenn_tagset as hlp
text = "Python is a great language."

words = word_tokenize(text)
for tagged_word in pos_tag(words):
    print(tagged_word)
    hlp(tagged_word[1])
    
# ('Python', 'NNP')
# NNP: noun, proper, singular
#    Motown Venneboerger Czestochwa Ranzer Conchita
#    Trumplane Christos Oceanside Escobar Kreisler
#    Sawyer Cougar Yvette Ervin ODI Darryl CTCA
#    Shannon A.K.C. Meltex Liverpool ...
# ('is', 'VBZ')
# VBZ: verb, present tense, 3rd person singular
#    bases reconstructs marks mixes displeases
#    seals carps weaves snatches slumps stretches
#    authorizes smolders pictures emerges stockpiles
#    seduces fizzes uses bolsters slaps speaks pleads ...
# ('a', 'DT')
# DT: determiner
#    all an another any both del each either every
#    half la many much nary neither no some such
#    that the them these this those 
# ('great', 'JJ')
# JJ: adjective or numeral, ordinal
#    third ill-mannered pre-war regrettable oiled
#    calamitous first separable ectoplasmic
#    battery-powered participatory fourth
#    still-to-be-named multilingual
#    multi-disciplinary ...
# ('language', 'NN')
# NN: noun, common, singular or mass
#    common-carrier cabbage knuckle-duster Casino
#    afghan shed thermostat investment slide humour
#    falloff slick wind hyena override subhumanity
#    machinist ...
# ('.', '.')
# .: sentence terminator
#    . ! ?

nlp18. Stemmer in Python NLTK

We use PorterStemmer for stemming a bunch of words.


Since PorterStemmer is a class, as the initial capital letter (the usual convention) suggests, we first have to make an object and then use its method. Since we will only use the stem method, we store just that bound method rather than the object.

# nlp18.py
from __future__ import print_function
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
text = """
cats catlike cat stemmer stemming stemmed stem
fishing fished fisher fish argue argued argues
arguing argument arguments
"""
PS = PorterStemmer().stem
for a in word_tokenize(text):
    print('%10s --> %10s' % (a,PS(a)) )

#      cats -->        cat
#   catlike -->     catlik
#       cat -->        cat
#   stemmer -->    stemmer
#  stemming -->       stem
#   stemmed -->       stem
#      stem -->       stem
#   fishing -->       fish
#    fished -->       fish
#    fisher -->     fisher
#      fish -->       fish
#     argue -->       argu
#    argued -->       argu
#    argues -->       argu
#   arguing -->       argu
#  argument -->   argument
# arguments -->   argument
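
Other stemmers can be swapped in the same way. A sketch with LancasterStemmer, which is generally more aggressive than Porter, so some stems may come out shorter:

# sketch: the same loop with the Lancaster stemmer
from nltk.stem import LancasterStemmer
LS = LancasterStemmer().stem
for a in word_tokenize(text):
    print('%10s --> %10s' % (a, LS(a)))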

Wednesday, May 27, 2015

nlp17. Frequency Distribution of Trigrams in Python NLTK

The counts of different ngrams can be found using nltk.probability.FreqDist. This class is built on collections.Counter, so the object we create behaves like a Counter.


We use the same text as before, but another random sentence has been added.

# nlp17.py
from __future__ import print_function
from nltk.probability import FreqDist
from nltk.collocations import ngrams
from nltk.tokenize import PunktWordTokenizer
from nltk.corpus import stopwords
text = """
NLTK is a leading platform for building Python
programs to work with human language data. It's
considered the best by 2 out of 30 computational
linguists. NLTK leading platform of world.
"""
S = stopwords.words('english')
tok = PunktWordTokenizer().tokenize
words = [w for w in tok(text) if w not in S]
words = [w for w in words if len(w)>2]
A = ngrams(words,3)
A = list(A)

for a in A:
    print(a)
print()

fdist = FreqDist(A)
for k,v in fdist.items():
    print(k,v)
    
#    ('NLTK', 'leading', 'platform')
#    ('leading', 'platform', 'building')
#    ('platform', 'building', 'Python')
#    ('building', 'Python', 'programs')
#    ('Python', 'programs', 'work')
#    ('programs', 'work', 'human')
#    ('work', 'human', 'language')
#    ('human', 'language', 'data.')
#    ('language', 'data.', 'considered')
#    ('data.', 'considered', 'best')
#    ('considered', 'best', 'computational')
#    ('best', 'computational', 'linguists.')
#    ('computational', 'linguists.', 'NLTK')
#    ('linguists.', 'NLTK', 'leading')
#    ('NLTK', 'leading', 'platform')
#    ('leading', 'platform', 'world.')
#    
#    ('platform', 'building', 'Python') 1
#    ('computational', 'linguists.', 'NLTK') 1
#    ('considered', 'best', 'computational') 1
#    ('linguists.', 'NLTK', 'leading') 1
#    ('best', 'computational', 'linguists.') 1
#    ('leading', 'platform', 'building') 1
#    ('building', 'Python', 'programs') 1
#    ('data.', 'considered', 'best') 1
#    ('Python', 'programs', 'work') 1
#    ('NLTK', 'leading', 'platform') 2
#    ('leading', 'platform', 'world.') 1
#    ('language', 'data.', 'considered') 1
#    ('programs', 'work', 'human') 1
#    ('work', 'human', 'language') 1
#    ('human', 'language', 'data.') 1
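
Since FreqDist behaves like a Counter (in NLTK 3 it is a Counter subclass), the usual Counter helpers are available; a small sketch:

# sketch: Counter-style queries on the frequency distribution
print(fdist.most_common(2))                    # the trigram seen twice should come first
print(fdist[('NLTK', 'leading', 'platform')])  # expected: 2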

nlp16. Bigrams and Trigrams in Python NLTK

Bigrams are sequences of 2 contiguous words; trigrams are sequences of 3 contiguous words.


We can use the ngrams function (imported here from nltk.collocations; it also lives in nltk.util) to create ngrams. Depending on the n parameter, we can get bigrams, trigrams, or any ngram. The function returns a generator object, from which a list can be created if needed, for example A = list(A).


The first line of text is from the nltk website; the second sentence is a random sentence. After tokenizing by words, we first filter out stop words, and then drop any remaining word shorter than 3 characters. Note that this removes the numbers.

# nlp16.py
from __future__ import print_function
from nltk.collocations import ngrams
from nltk.tokenize import PunktWordTokenizer
from nltk.corpus import stopwords
text = """
NLTK is a leading platform for building Python
programs to work with human language data. It's
considered the best by 2 out of 30 computational
linguists.
"""
S = stopwords.words('english')
tok = PunktWordTokenizer().tokenize
words = [w for w in tok(text) if w not in S]
words = [w for w in words if len(w)>2]
A = ngrams(words,2)
B = ngrams(words,3)
print("Bigrams")
for a in A:
    print(a)
print()
print("Trigrams")
for b in B:
    print(b)
    
#    Bigrams
#    ('NLTK', 'leading')
#    ('leading', 'platform')
#    ('platform', 'building')
#    ('building', 'Python')
#    ('Python', 'programs')
#    ('programs', 'work')
#    ('work', 'human')
#    ('human', 'language')
#    ('language', 'data.')
#    ('data.', 'considered')
#    ('considered', 'best')
#    ('best', 'computational')
#    ('computational', 'linguists.')
#    
#    Trigrams
#    ('NLTK', 'leading', 'platform')
#    ('leading', 'platform', 'building')
#    ('platform', 'building', 'Python')
#    ('building', 'Python', 'programs')
#    ('Python', 'programs', 'work')
#    ('programs', 'work', 'human')
#    ('work', 'human', 'language')
#    ('human', 'language', 'data.')
#    ('language', 'data.', 'considered')
#    ('data.', 'considered', 'best')
#    ('considered', 'best', 'computational')
#    ('best', 'computational', 'linguists.')

Tuesday, May 26, 2015

nlp15. Lemmas in Python NLTK

Besides Synset objects, WordNet also has Lemma objects. A Lemma pairs a word form with a particular synset.


Below, we have the lemmas of alive.

# nlp15.py
from __future__ import print_function, division
from nltk.corpus import wordnet
A = wordnet.lemmas('alive')
B = wordnet.synsets('alive')
for i in range(len(A)):
    print(A[i])
    print(B[i].lemma_names())

#    Lemma('alive.a.01.alive')
#    [u'alive', u'live']
#    Lemma('alive.s.02.alive')
#    [u'alive']
#    Lemma('animated.a.01.alive')
#    [u'animated', u'alive']
#    Lemma('alive.s.04.alive')
#    [u'alive']
#    Lemma('active.s.08.alive')
#    [u'active', u'alive']
#    Lemma('alert.s.03.alive')
#    [u'alert', u'alive', u'awake']
#    Lemma('alive.s.07.alive')
#    [u'alive', u'live']

nlp14. WordNet Examples and Hyponyms in Python NLTK

Among the methods of WordNet are examples and hyponyms.


examples() finds example sentences where the word is used, while hyponyms() finds the more specific synsets below it in the hypernym tree.

# nlp14.py
from __future__ import print_function, division
from nltk.corpus import wordnet
arr = "\t-->"
A = wordnet.synset('love.n.01')
for ex in A.examples():
    print('ex:'+ex)
for h in A.hyponyms():
    print(arr,h)

# ex:his love for his work
# ex:children need a lot of love
#        --> Synset('agape.n.01')
#        --> Synset('agape.n.02')
#        --> Synset('amorousness.n.01')
#        --> Synset('ardor.n.02')
#        --> Synset('benevolence.n.01')
#        --> Synset('devotion.n.01')
#        --> Synset('filial_love.n.01')
#        --> Synset('heartstrings.n.01')
#        --> Synset('lovingness.n.01')
#        --> Synset('loyalty.n.02')
#        --> Synset('puppy_love.n.01')
#        --> Synset('worship.n.02')
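
hypernyms() walks in the opposite direction, up the tree toward more general synsets; a one-line sketch with the same synset:

# sketch: the hypernyms (more general terms) of love.n.01
for h in A.hypernyms():
    print(arr, h)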

nlp13. WordNet in Python NLTK

WordNet provides a dictionary-like structure of Synset objects.


We can give it a string and, optionally, a part of speech. If we don't give a part of speech, it returns all matching synsets across parts of speech.


We use the definition method as we iterate over the objects returned.

# nlp13.py
from __future__ import print_function, division
from nltk.corpus import wordnet
arr = "\t-->"
A = wordnet.synsets('love')
for s in A:
    print(s)
    print(arr+s.definition())

# Synset('love.n.01')
#        -->a strong positive emotion of regard and affection
# Synset('love.n.02')
#        -->any object of warm affection or devotion; 
# Synset('beloved.n.01')
#        -->a beloved person; used as terms of endearment
# Synset('love.n.04')
#        -->a deep feeling of sexual desire and attraction
# Synset('love.n.05')
#        -->a score of zero in tennis or squash
# Synset('sexual_love.n.02')
#        -->sexual activities (often including sexual intercourse)
#           between two people
# Synset('love.v.01')
#        -->have a great affection or liking for
# Synset('love.v.02')
#        -->get pleasure from
# Synset('love.v.03')
#        -->be enamored or in love with
# Synset('sleep_together.v.01')
#        -->have sexual intercourse with
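
To restrict the lookup to a single part of speech, a pos argument can be given; a short sketch using the wordnet constants:

# sketch: only the verb senses of love
for s in wordnet.synsets('love', pos=wordnet.VERB):
    print(s)
    print(arr + s.definition())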

nlp12. Fileids in Python NLTK

We can access a specific text within a corpus by using a fileid.


The length of inaugural, that is, len(inaugural.words()), is 145735. However, by passing a fileid to the words method, we can select only a particular text.


The particular text we selected has a word length of 1538, that is, len(inaugural.words('1789-Washington.txt')) equals 1538. We can use the fileids method of inaugural, or whatever the corpus happens to be, to get a list of the text names.


The first few words of the first inaugural are printed.

# nlp12.py
from __future__ import print_function, division
from nltk.corpus import inaugural
A = inaugural.fileids()
s = 2*' '
for a in A[:5]:
    print(s+a)
B = inaugural.words(A[0])
for b in B[:20]:
    print(b, end = s)

#  1789-Washington.txt
#  1793-Washington.txt
#  1797-Adams.txt
#  1801-Jefferson.txt
#  1805-Jefferson.txt
# Fellow  -  Citizens  of  the  Senate  and  of
# the  House  of  Representatives  :  Among  the
# vicissitudes  incident  to  life  no  

nlp11. RegexpTokenizer in Python NLTK

We can use RegexpTokenizer to write our own tokenizers.


Our test string here alternates numbers and words. The regular expression separates the numbers from the words, and it treats a period (.) as part of a number.


Thus the only tokens selected contain letters, apostrophes, digits, or periods; characters such as ? or ! will not be selected.

# nlp11.py
from __future__ import print_function, division
from nltk.tokenize import RegexpTokenizer
A = "I'll3finish45my987project2.2today!3a"
tok = RegexpTokenizer("([a-zA-Z']+|[0-9.]+)")
B = tok.tokenize(A)
for b in B: print('\t'+b)
#        I'll
#        3
#        finish
#        45
#        my
#        987
#        project
#        2.2
#        today
#        3
#        a

nlp10. PunktWordTokenizer and WordPunctTokenizer in Python NLTK

PunktWordTokenizer and WordPunctTokenizer will give different tokens for words such as I'll.


The same line is tokenized with three different word tokenizers, giving the lists B1, B2, and B3, which have different lengths. To print rows up to the last index of the longest list, try clauses are used so an entry is printed only if that index exists. Since each clause contains only one statement, we may put that statement right after the colon.

# nlp10.py
from __future__ import print_function, division
from nltk.tokenize import (PunktWordTokenizer,
                           WordPunctTokenizer, word_tokenize)
A = "I'll finish my project today."
PWT = PunktWordTokenizer()
WPT = WordPunctTokenizer()
w = word_tokenize
B1 = PWT.tokenize(A)
B2 = WPT.tokenize(A)
B3 = w(A)
L1,L2,L3 = len(B1),len(B2),len(B3)
print('B1\tB2\tB3')
for i in range(max(L1,L2,L3)):
    try: print(B1[i],end='\t')
    except: print(end='\t')
    try: print(B2[i],end='\t')
    except: print(end='\t')
    try: print(B3[i],end='\t')
    except: print(end='\t')
    print()

#    B1      B2      B3
#    I       I       I       
#    'll     '       'll     
#    finish  ll      finish  
#    my      finish  my      
#    project my      project 
#    today.  project today   
#            today   .       

Monday, May 25, 2015

nlp9. Lexical Dispersion Plot in Python NLTK

A lexical dispersion plot will plot occurrences of words in a text.


Here, we select a subset of stopwords that occur more than 90 times and less than 100 times. There are 4 such words and they form the to_plot list, which is sent to the dispersion_plot function.

#nlp9.py
from __future__ import print_function, division
from nltk.corpus import stopwords
from nltk.book import text4
print("%s has a vocabulary of %d" % (text4,len(set(text4))))
words = stopwords.words('english')
to_plot = []
tot = 0
for word in words:
    count = text4.count(word)
    tot += count
    if 90<count<100:
        to_plot.append(word)
        print("count of %s is %d" % (word,count))
print("A total of %d stop words were used." % tot)
print("Total text length is",len(text4))
text4.dispersion_plot(to_plot)

# <Text: Inaugural Address Corpus> has a vocabulary of 9754
# count of between is 93
# count of through is 99
# count of out is 91
# count of some is 91
# A total of 64854 stop words were used.
# Total text length is 145735

Output: the lexical dispersion plot (screenshot not included here).

nlp8. Shortening stop word list in Python NLTK

In the previous program, the stop list S contained 127 words.


We remove the words 'then' and 'now' using set operations, so its new size is 125. Even though S changed from a list to a set, the rest of the program did not have to change.

# nlp8.py
from __future__ import print_function, division
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
lines = """Dr. Brown gave a speech. I'd wish I knew then
what I know now. Finally, he praised Python! At 8 o'clock,
he went home.""" 
S = stopwords.words("english")
nS = ['then','now']
S = set(S)-set(nS)
t = '\t'
A = word_tokenize(lines.lower())
for a in A:
    if a not in S:
        print(t,a)

#         dr.
#         brown
#         gave
#         speech
#         .
#         'd
#         wish
#         knew
#         then
#         know
#         now
#         .
#         finally
#         ,
#         praised
#         python
#         !
#         8
#         o'clock
#         ,
#         went
#         home
#         .

nlp7. Stop word removal in Python NLTK

The function nltk.corpus.stopwords.words gets a list of 127 stop words which usually do not add much to the meaning of sentences. However, it is always possible to find exceptions.


The list is put in S. If you are getting too much filtering, you should try to shorten the stoplist.

# nlp7.py
from __future__ import print_function, division
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
lines = """Dr. Brown gave a speech. I'd wish I knew then
what I know now. Finally, he praised Python! At 8 o'clock,
he went home.""" 
S = stopwords.words("english")
t = '\t'
A = word_tokenize(lines.lower())
for a in A:
    if a not in S:
        print(t,a)

#    dr.
#    brown
#    gave
#    speech
#    .
#    'd
#    wish
#    knew
#    know
#    .
#    finally
#    ,
#    praised
#    python
#    !
#    8
#    o'clock
#    ,
#    went
#    home
#    .

nlp6. Treebank tokenizer in Python NLTK

The program below works the same as the last one, since the Treebank tokenizer is the default word tokenizer.


Parts of the DocString are printed. We import nltk.tokenize.TreebankWordTokenizer under the alias TWT.


Instead of using the regular expressions in Penn Treebank, we may also create new rules.

# nlp6.py
from __future__ import print_function, division
from nltk.tokenize import TreebankWordTokenizer as TWT
lines = """Dr. Brown gave a speech. I'd wish I knew then
what I know now. Finally, he praised Python! At 8 o'clock,
he went home.""" 

t = '\t'
A = TWT()
B = A.tokenize(lines)
print("DocString")
for i in TWT.__doc__.split('\n')[1:4]:
    print(i)
for i,b in enumerate(B):
    print(t,i,b)

#DocString
#    The Treebank tokenizer uses regular expressions to tokenize
# text as in Penn Treebank. This is the method that is invoked by
# ``word_tokenize()``.  It assumes that the text has already been
# segmented into sentences, e.g. using ``sent_tokenize()``.
#         0 Dr.
#         1 Brown
#         2 gave
#         3 a
#         4 speech.
#         5 I
#         6 'd
#         7 wish
#         8 I
#         9 knew
#         10 then
#         11 what
#         12 I
#         13 know
#         14 now.
#         15 Finally
#         16 ,
#         17 he
#         18 praised
#         19 Python
#         20 !
#         21 At
#         22 8
#         23 o'clock
#         24 ,
#         25 he
#         26 went
#         27 home
#         28 .

nlp5. Word Tokenization in Python NLTK

Usually, we want a text to be broken into words, which is done by nltk.tokenize.word_tokenize.


As we can see from the DocString, it uses sentence tokenizing as well.


Instead of using enumerate, we can always iterate over the indices.

# nlp5.py
from __future__ import print_function, division
from nltk.tokenize import word_tokenize
lines = """This is the first sentence. Dr. Brown gave a speech.
Finally, he praised Python! At 8 o'clock, he went home.""" 

A = word_tokenize(lines)
print("DocString for %s:\n%s" % ("word_tokenize",
                                 word_tokenize.__doc__.strip()))
for i in range(len(A)):
    print(i,A[i])

#    DocString for word_tokenize:
#    Return a tokenized copy of *text*,
#        using NLTK's recommended word tokenizer
#        (currently :class:`.TreebankWordTokenizer`
#        along with :class:`.PunktSentenceTokenizer`).
#    0 This
#    1 is
#    2 the
#    3 first
#    4 sentence
#    5 .
#    6 Dr.
#    7 Brown
#    8 gave
#    9 a
#    10 speech
#    11 .
#    12 Finally
#    13 ,
#    14 he
#    15 praised
#    16 Python
#    17 !
#    18 At
#    19 8
#    20 o'clock
#    21 ,
#    22 he
#    23 went
#    24 home
#    25 .

nlp4. Directly loading a tokenizer in Python NLTK

This program does the same thing as the last.


Now we explicitly load our tokenizer. It has to be found in the nltk_data folder. This load was implicit in the last program.


We also print the DocString, after removing whitespace characters.

# nlp4.py
from __future__ import print_function, division
from nltk.data import load
lines = """This is the first sentence. Dr. Brown gave a speech.
Finally, he praised Python! At 8 o'clock, he went home.""" 

tok = load("tokenizers/punkt/english.pickle")
print("DocString:\n",tok.tokenize.__doc__.strip())
A = tok.tokenize(lines)

print('type(A)=',type(A))
for i,j in enumerate(A):
    print(i,': ',j)

# DocString:
#  Given a text, returns a list of the sentences in that text.
# type(A)= <type 'list'>
# 0 :  This is the first sentence.
# 1 :  Dr. Brown gave a speech.
# 2 :  Finally, he praised Python!
# 3 :  At 8 o'clock, he went home.

Sunday, May 24, 2015

nlp3. Sentence tokenization in Python NLTK

A text has to be broken into sentences for further processing.


We can always write a bunch of rules, or we can use nltk.tokenize.sent_tokenize.

# nlp3.py
from __future__ import print_function, division
from nltk.tokenize import sent_tokenize
lines = """This is the first sentence. Dr. Brown gave a speech.
Finally, he praised Python! At 8 o'clock, he went home.""" 

A = sent_tokenize(lines)
print('type(A)=',type(A))
for i,j in enumerate(A):
    print(i,': ',j)

#    type(A)= <type 'list'>
#    0 :  This is the first sentence.
#    1 :  Dr. Brown gave a speech.
#    2 :  Finally, he praised Python!
#    3 :  At 8 o'clock, he went home.

nlp2. Concordance in Python NLTK

Concordance gives the context of some text inside a corpus.


Here, we iterate over three strings in a Python list and see what the Wall Street Journal corpus contains for those entries.


Unlike the count method, which returns an integer, the concordance method returns None; it just prints its results.

# nlp2.py
from __future__ import print_function, division
from nltk.book import text7
print('text7 =',text7)
print('text 7 length =',len(text7))
St = ["Indonesia","Singapore","Malaysia"]
for st in St:
    n = text7.count(st)
    print("The string %s ocurrs %d times" % (st,n))
    print("The occurences:")
    text7.concordance(st,50)
    
#    text7 = <Text: Wall Street Journal>
#    text 7 length = 100676
#    The string Indonesia occurs 2 times
#    The occurrences:
#    Displaying 2 of 2 matches:
#     and export them to Indonesia . `` The effect wil
#    aysia , Singapore , Indonesia , the Philippines a
#    The string Singapore occurs 4 times
#    The occurrences:
#    Displaying 4 of 4 matches:
#     tobacco smoke . In Singapore , a new law require
#    cial said 0 *T*-1 . Singapore already bans smokin
#    ailand , Malaysia , Singapore , Indonesia , the P
#    es closed higher in Singapore , Taipei and Wellin
#    The string Malaysia occurs 6 times
#    The occurrences:
#    Displaying 6 of 6 matches:
#    ing slow progress in Malaysia . '' She did n't ela
#    eocassette piracy in Malaysia and disregard for U.
#    ood restaurants . In Malaysia , Siti Zaharah Sulai
#    such as Thailand and Malaysia , the investment wil
#    assemble the sets in Malaysia and export them to I
#    ations -- Thailand , Malaysia , Singapore , Indone

Thursday, May 21, 2015

nlp1. Reading a text in Python NLTK

The NLTK module in Python can be used to load a text, or corpus. In the nltk_data folder, you can find the included texts. This assumes all the data files have been downloaded to the computer using nltk.download().


Here Shakespeare’s Julius Caesar is read as a raw string. We may also use the xml loader, which allows parsing the element tree, for example the <LINE> elements (a sketch of that approach follows the output below).


The <LINE> elements are extracted using regular expressions. Only a subset of the lines are printed; those with the word 'Pompey'.

# nlp1.py
from __future__ import print_function, division
from nltk.corpus import shakespeare
import re
sp = " " * 2
jc = shakespeare.raw("j_caesar.xml")
jc_lines = re.findall(r"<LINE>.+</LINE>", jc)
for line in jc_lines:
    lin = line[6:-7]
    if lin.count("Pompey"):
        print(sp+lin)
        
#  Knew you not Pompey? Many a time and oft
#  To see great Pompey pass the streets of Rome:
#  That comes in triumph over Pompey's blood? Be gone!
#  In Pompey's porch: for now, this fearful night,
#  Repair to Pompey's porch, where you shall find us.
#  That done, repair to Pompey's theatre.
#  Who rated him for speaking well of Pompey:
#  That now on Pompey's basis lies along
#  Even at the base of Pompey's statua,
#  As Pompey was, am I compell'd to set
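
A sketch of the XML route mentioned above: the corpus reader's xml() method returns an ElementTree element, so the <LINE> elements can be found without regular expressions.

# sketch: parse the play as XML and search the element tree
play = shakespeare.xml("j_caesar.xml")
for line in play.findall(".//LINE"):
    if line.text and "Pompey" in line.text:
        print(sp + line.text)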

Sunday, May 10, 2015

ML1. K-Nearest Neighbor in Python

K-Nearest Neighbor is a supervised lazy learning technique.


The Iris dataset is used, with 150 instances, 4 features and 3 classes. The first 50 observations (rows) correspond to class 0, next 50 rows to class 1 and last 50 rows to class 2. The program prints the class names.


10-fold cross validation is used. Thus 150/10 = 15 instances are used for testing and the rest for training. This is done 10 times, each time with a new set of indices. The KFold function has the shuffle parameter set to True, so each test/training split is likely to contain samples from all 3 classes.


The accuracy_score function is used to find the fraction of correctly labelled test values. Since there are 135 training points, each of the 15 test points is compared against 135 points in 4D space during every train-test iteration.

# ML1.py
from __future__ import print_function, division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Loading data (150,4)
data = load_iris()
x = data.data
y = data.target
print('The three classes are',data.target_names)

# Use 5 nearest neighbors
classifier = KNeighborsClassifier(n_neighbors=5)

# Running 10 tests using 10-fold cross validation
test = set()
acc = []
kf = KFold(len(x), n_folds=10, shuffle=True)
for trn,tst in kf:
    x_train = x[trn]
    y_train = y[trn]
    print('length of x_train:',len(x_train))
    classifier.fit(x_train, y_train)
    x_test = x[tst]
    y_test = y[tst]
    test = test.intersection(tst)
    print('length of x_test:',len(x_test))
    print('tst:',tst)
    pred = classifier.predict(x_test)
    acc.append(accuracy_score(y_test,pred))

# Accuracy
print('Result: {}'.format(sum(acc)/len(acc)))
print('length of test: {}'.format(len(test)))

#The three classes are ['setosa' 'versicolor' 'virginica']
#length of x_train: 135
#length of x_test: 15
#tst: [ 11  20  37  42  58  88  94  95  99 101 117 121 132 136 146]
#length of x_train: 135
#length of x_test: 15
#tst: [  0  13  19  26  47  64  76  86  97  98 104 105 120 133 143]
#length of x_train: 135
#length of x_test: 15
#tst: [ 12  18  24  27  30  33  35  38  48  51  55  60 106 122 144]
#length of x_train: 135
#length of x_test: 15
#tst: [  5  32  45  52  65  66  81  83  90 102 116 131 137 139 148]
#length of x_train: 135
#length of x_test: 15
#tst: [  3   4  17  23  29  31  40  41  49  79  85  87 109 114 145]
#length of x_train: 135
#length of x_test: 15
#tst: [  1   2  54  57  61  80  89  96 113 115 118 127 128 134 141]
#length of x_train: 135
#length of x_test: 15
#tst: [  7   8  10  15  16  71  74  82 125 129 130 135 140 142 149]
#length of x_train: 135
#length of x_test: 15
#tst: [  9  14  53  56  68  69  73  75  77  91 100 103 107 110 111]
#length of x_train: 135
#length of x_test: 15
#tst: [  6  21  25  28  34  44  46  62  63  70  92 119 126 138 147]
#length of x_train: 135
#length of x_test: 15
#tst: [ 22  36  39  43  50  59  67  72  78  84  93 108 112 123 124]
#Result: 0.973333333333
#length of test: 0
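
The same experiment can be written more compactly with cross_val_score, from the same (now deprecated) sklearn.cross_validation module used above. Note that by default it uses unshuffled, stratified folds for classification, so the exact accuracy may differ slightly; a sketch:

# sketch: compact 10-fold cross validation
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(classifier, x, y, cv=10)
print('Mean accuracy: {}'.format(scores.mean()))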