# Noun-phrase (NP) chunking with a regular-expression chunk grammar.
import nltk
from nltk.chunk import RegexpParser

# Chunk grammar for noun phrases:
#   JJ = adjective; NN = noun, singular or mass; CC = coordinating conjunction.
# An NP is zero or more adjectives followed by one or more nouns, optionally
# joined by a conjunction.
pattern = """ NP: {<JJ>*<NN>+} {<JJ>*<NN><CC>*<NN>+} """

# Build the chunk parser from the grammar.
chunker = RegexpParser(pattern)

# NOTE(review): `text` is assumed to be a raw input string defined earlier
# in the file — confirm against the surrounding code.
# Split the text into sentences.
tokenized_sentence = nltk.sent_tokenize(text)
# Split each sentence into word tokens.
tokenized_words = [nltk.word_tokenize(sentence) for sentence in tokenized_sentence]
# Part-of-speech tag each tokenized sentence.
tagged_words = [nltk.pos_tag(word) for word in tokenized_words]
# Parse each tagged sentence into an NP chunk tree.
word_tree = [chunker.parse(word) for word in tagged_words]
# If you do not provide the POS tag of the word, the lemmatizer will treat
# the word as a noun, so you may not get the result you expect.
# NOTE(review): `lemmatizer` is not defined in this chunk — presumably a
# WordNetLemmatizer instance created earlier in the file; confirm.
lemmatizer.lemmatize('spoken')  # Out: 'spoken' (lemmatized as a noun)
# FreqDist cheat sheet: frequency statistics over a token list `l`.
# NOTE(review): `l` is assumed to be a list of word tokens built earlier in
# the file — confirm. The original snippet mixed `fd` and an undefined
# `fd_w_humor`; unified to `fd` here.
fd = FreqDist(l)
# The 2 most frequent samples as (sample, count) pairs.
fd.most_common(2)
# Return a list of all samples that occur exactly once (hapax legomena).
fd.hapaxes()
# Find the word occurring the maximum number of times.
fd.max()
# Freq = number of occurrences / total number of words.
fd.freq('the')
# Check how many times the word 'pen' appeared.
# (dict-style .get returns None when absent; fd['pen'] would return 0.)
fd.get('pen')
# Conditional Frequency Distribution: built from (condition, sample) pairs —
# here (genre, word) over the whole Brown corpus.
# Use the tabulate method to check the distribution of modal words in
# different genres.
cfd = ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# NOTE(review): `genres` and `modals` are not defined in this chunk; the
# usual setup is e.g.
#   genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
#   modals = ['can', 'could', 'may', 'might', 'must', 'will']
# — confirm against the rest of the file.
cfd.tabulate(conditions=genres, samples=modals)
# Build (gender, last letter) pairs from the names corpus — the usual input
# for plotting a ConditionalFreqDist of name endings by gender.
# NOTE(review): the original comment said "plot the distribution", but no
# plotting call is visible here — presumably ConditionalFreqDist(l_names).plot()
# follows elsewhere in the file; confirm.
l_names = ([('male', name[-1]) for name in names.words('male.txt')] +
           [('female', name[-1]) for name in names.words('female.txt')])