
To make the subsequent tests work, though, we will have to fill in the skeleton for our
tokenizer module like so:
import re

class Tokenizer:
    NULL = u'\u0000'

    @staticmethod
    def tokenize(string):
        # Lowercase the input and pull out runs of word characters.
        return re.findall(r"\w+", string.lower())

    @staticmethod
    def ngram(string, ngram):
        tokens = Tokenizer.tokenize(string)
        ngrams = []
        for i in range(len(tokens)):
            # Each n-gram ends at token i; near the start of the string
            # there are not enough earlier tokens, so the slice is padded
            # with NULL markers to keep every n-gram the same length.
            shift = i - ngram + 1
            padding = max(-shift, 0)
            first_idx = max(shift, 0)
            last_idx = first_idx + ngram - padding
            ngrams.append(Tokenizer.pad(tokens[first_idx:last_idx], padding))
        return ngrams

    @staticmethod
    def pad(tokens, padding):
        # Prepend `padding` NULL markers, then the tokens themselves.
        padded_tokens = []
        for _ in range(padding):
            padded_tokens.append(Tokenizer.NULL)
        padded_tokens.extend(tokens)
        return padded_tokens
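As a quick sanity check (the sample sentence here is our own, not one of the tests
above), bigrams come back left-padded with the NULL marker, so every n-gram has the
same length even at the start of the string:

print(Tokenizer.ngram("The quick brown fox", 2))
# => [['\x00', 'the'], ['the', 'quick'], ['quick', 'brown'], ['brown', 'fox']]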