Sunday, May 31, 2015

nlp28. Reading a tagged corpus in Python NLTK

We use TaggedCorpusReader to read the corpus created in the last program.


The methods words() and tagged_words() are used to get the file contents as strings or as (word, tag) tuples, respectively.

# nlp28.py
from __future__ import print_function
from nltk.data import path
from nltk.corpus.reader import TaggedCorpusReader
# nltk.data.path is the list of directories NLTK searches for data; the
# corpus is expected in a 'MyTest2Corpus' folder under the first of them.
# A separate name is used so the imported `path` list is not overwritten.
corpus_root = path[0] + '/MyTest2Corpus'
# '.*' is the file-id pattern: read every file found under corpus_root.
reader = TaggedCorpusReader(corpus_root, '.*')


def _print_pairs(items):
    """Print consecutive items of `items` two per line, separated by a tab.

    A trailing unpaired item (odd length) is not printed.
    """
    # Floor division keeps the count an int on Python 3 as well as Python 2;
    # with `/` Python 3 produces a float and range() raises TypeError.
    for i in range(len(items) // 2):
        print(items[2 * i], '\t', items[2 * i + 1])


word1 = reader.words()
print("words1 =")
_print_pairs(word1)
print()
tag_words1 = reader.tagged_words()
print("tag_words1 =")
_print_pairs(tag_words1)

# words1 =
# Bush     Clinton
# Rubio    Clinton
# Paul     Sanders
#
# tag_words1 =
# (u'Bush', u'R')          (u'Clinton', u'D')
# (u'Rubio', u'R')         (u'Clinton', u'D')
# (u'Paul', u'R')          (u'Sanders', u'D')

No comments:

Post a Comment