    def __init__(self, infile=sys.stdin, separator='\t'):
        super(BigramMapper, self).__init__(infile, separator)
        self.stopwords = nltk.corpus.stopwords.words("english")
        self.punctuation = string.punctuation

    def exclude(self, token):
        # Skip punctuation marks and English stopwords.
        return token in self.punctuation or token in self.stopwords

    def normalize(self, token):
        return token.lower()

    def tokenize(self, value):
        # Split the line into tokens, lowercase them, and drop excluded ones.
        for token in nltk.wordpunct_tokenize(value):
            token = self.normalize(token)
            if not self.exclude(token):
                yield token

    def map(self):
        for value in self:
            for bigram in nltk.bigrams(self.tokenize(value)):
                self.counter("words")  # count the total number of bigrams
                self.emit(bigram, 1)
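The methods above rely on a Mapper base class, introduced earlier in the chapter, that supplies line-by-line iteration over the input plus the emit and counter helpers. As a rough, self-contained sketch of what that base class needs to provide, the following stand-in (with assumed names and behavior, not necessarily the chapter's actual framework code) is enough to exercise BigramMapper as a Hadoop Streaming mapper:

import string
import sys

import nltk


class Mapper(object):
    # Minimal stand-in for the chapter's streaming Mapper base class (assumed).

    def __init__(self, infile=sys.stdin, separator='\t'):
        self.infile = infile
        self.separator = separator

    def __iter__(self):
        # BigramMapper.map() iterates over the mapper itself, one input line at a time.
        for line in self.infile:
            yield line.rstrip('\n')

    def emit(self, key, value):
        # Write a key/value pair to stdout, separated by the configured delimiter.
        sys.stdout.write("{0}{1}{2}\n".format(key, self.separator, value))

    def counter(self, name, group="Python Streaming", increment=1):
        # Hadoop Streaming picks up counter updates from stderr lines in this format.
        sys.stderr.write("reporter:counter:{0},{1},{2}\n".format(group, name, increment))


if __name__ == '__main__':
    mapper = BigramMapper()  # the class whose methods are listed above
    mapper.map()

With this stand-in in the same module as BigramMapper, the script can be tested locally with a shell pipe such as cat corpus.txt | python bigram_mapper.py (hypothetical file names), or handed to Hadoop Streaming as the -mapper program.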