《Python自然语言处理》第五章练习题答案

2021-03-23 20:00:34 阅读：197 来源： 互联网

标签：练习题 brown tagged Python tag words sents 自然语言 nltk

这章主要内容涉及分词、词性标注和标注器训练、字典使用。
因为中英文差别，所以在后面练习里尝试用中文数据来训练ngram标注器。

首先导包

import nltk
from nltk.corpus import brown
from nltk.book import *
import jieba
import matplotlib.pyplot as plt

# NLTK's POS tagger cannot resolve the ambiguity in this headline.
headline = 'British Left Waffles on Falkland Islands'
text = nltk.word_tokenize(headline)
nltk.pos_tag(text)

# Print the tag attached to the first occurrence of 'contest' in Brown.
tag_words = brown.tagged_words()
first_tag = next((tag for (word, tag) in tag_words if word == 'contest'), None)
if first_tag is not None:
    print(first_tag)

# 'wind' appears twice in this sentence; tag it to compare the two tags.
wind_sentence = 'They wind back the clock,while we chase after the wind.'
nltk.pos_tag(nltk.word_tokenize(wind_sentence))

# dict.update() copies every key/value pair of d2 into d1; d2 is unchanged.
d1 = dict(a=1, b=2, c=3)
d2 = dict(d=4, f=5, g=6)
d1.update(d2)
print(d1, d2)

# Show the concordance of both verb forms in text1 (from nltk.book).
for verb_form in ('go', 'went'):
    text1.concordance(verb_form)

import re

# Train a unigram tagger on the whole Brown corpus, tag a piece of text and
# collect the words that received no tag (tag is None).
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
# Raw-string pattern: commas and full stops are replaced by a space so that
# str.split() below yields bare words.  (The original '[\,\.]' relied on
# invalid escape sequences in a normal string literal.)
new_text = re.sub(r'[,.]',' ',"What needs to be clarified is that the fundamental purpose of China's development is to ensure that the Chinese people can live a better life and to benefit all humankind. Win-win cooperation is an important principle of China's development and a golden rule in China's external relations. China has no intention to interfere in the political system of the United States, nor challenge or replace its status and influence.In the past few years, due to Washington's irrational suppression of China's legitimate rights and interests, China-US relations have encountered unprecedented difficulties. This situation should not continue any longer. The only right way is to follow the principles of non-conflict, non-confrontation, mutual respect and win-win cooperation.")
word_tags = unigram_tagger.tag(new_text.split())
# Words the tagger could not label.
none_tag = [word for (word, tag) in word_tags if tag is None]
none_tag

未被标注（标记为 None）的词主要有三类：拼写不规范的词、带连字符的词，以及训练语料中没有出现过的新词。

# Print the built-in documentation of nltk.AffixTagger.
help(nltk.AffixTagger)

用法：AffixTagger(train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False)

# Train an affix tagger on the Brown 'news' category and tag one sentence.
news_category = 'news'
brown_sents = brown.sents(categories=news_category)
brown_tagged_sents = brown.tagged_sents(categories=news_category)
affixtagger = nltk.AffixTagger(
    train=brown_tagged_sents,
    affix_length=-3,
    min_stem_length=2,
)
sample_sent = brown_sents[2007]
affixtagger.tag(sample_sent)


# Bigram tagger without backoff, trained on all of Brown and first scored on
# the very sentences it was trained on.
sents = brown.sents()
tag_sents = brown.tagged_sents()
baseline_tagger = nltk.BigramTagger(train=tag_sents)
baseline_tagger.evaluate(tag_sents)

# Tag a whitespace-tokenised passage with the same tagger.
raw_passage = "They expressed their willingness to enhance cooperation or coordination in some specific areas. For instance, the two sides are committed to strengthening dialogue and cooperation in the field of climate change and will establish a joint working group on that subject. In the spirit of reciprocity and mutual benefit, the two sides will hold talks on facilitating activities of each other's diplomatic and consular missions and personnel, as well as on issues related to media reporters."
sent = raw_passage.split()
baseline_tagger.tag(sent)

# Score against whatever `brown_tagged_sents` currently refers to.
baseline_tagger.evaluate(brown_tagged_sents)

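在上面的实验中，bigram 标注器换用 news 类别的数据评估后得分有所提高。但需要注意：news 类别本身包含在训练语料（整个 Brown 语料库）之中，并不是真正的“新数据”；在训练时完全没有见过的数据上，没有回退标注器的 bigram 标注器会因为遇到未见过的上下文而把大量词标注为 None，得分通常会明显下降。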

# %-formatting: %i for integer arguments, %s for string arguments.
ymd_ints = (2021, 3, 21)
print("date:%i-%i-%i" % ymd_ints)
ymd_strs = ('2021', '3', '21')
print("date:%s/%s/%s" % ymd_strs)

# Case-insensitive word frequencies of the Brown corpus in a plain dict.
words = brown.words()
fd_dic = {}
for token in words:
    key = token.lower()
    fd_dic[key] = fd_dic.get(key, 0) + 1
fd_dic

# Most frequent words first.
sorted(fd_dic.items(), key=lambda pair: pair[1], reverse=True)

# Distinct tags that occur when Brown is read with the universal tagset.
words = brown.tagged_words(tagset='universal')
{tag for (_, tag) in words}

# Count, for every noun, how often its regular "-s" plural occurs in Brown.
# `words` is the tagged-word list currently in scope; it is expected to carry
# universal tags, since nouns are selected with the 'NOUN' tag.
n_words = {word for (word, tag) in words if tag == 'NOUN'}
n_dic = {}
for w in brown.words():
    # Only words that actually end in "s" can be regular plurals.  Without
    # this check the last letter of *every* word was dropped, so e.g. "card"
    # was counted as a plural of "car".
    if len(w) > 2 and w.endswith('s'):
        stem = w[:-1]
        if stem in n_words:
            n_dic[stem] = n_dic.get(stem, 0) + 1
# Nouns with the most frequent plural form first.
sorted(n_dic.items(),key = lambda item:item[1],reverse=True)

# Number of distinct tags each (lower-cased) word receives.
cfd = nltk.ConditionalFreqDist(
    (token.lower(), tag) for (token, tag) in words
)
count_dic = {word: len(cfd[word]) for word in cfd.conditions()}
# Words with the largest number of distinct tags first.
sorted(count_dic.items(), key=lambda pair: pair[1], reverse=True)

# Frequency of every tag in the Brown corpus (default tagset).
count_tag = {}
words = brown.tagged_words()
for (_word, tag) in words:
    count_tag[tag] = count_tag.get(tag, 0) + 1
# Most frequent tags first.
sorted(count_tag.items(), key=lambda pair: pair[1], reverse=True)

# Count which (universal) tags directly follow a noun.
words = brown.tagged_words(tagset='universal')
count_tags = {}
# Walk over adjacent token pairs instead of indexing words[i + 1]: the index
# form raises IndexError if the final token is a noun, and random access into
# the corpus view is slow.
for (word, tag), (next_word, back_tag) in nltk.bigrams(words):
    if tag == 'NOUN':
        count_tags[back_tag] = count_tags.get(back_tag, 0) + 1
# Most frequent following tags first.
sorted(count_tags.items(),key = lambda item:item[1],reverse=True)

# Lookup tagger: every word is mapped to its most frequent Brown tag.
fd = nltk.FreqDist(brown.words())  # not used below; kept from the original
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) rather than over every
# token of the corpus; the resulting mapping is the same.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown_tagged_sents)

# Lookup tagger as before, now backing off to the default tag 'NN' for words
# that are missing from the model.
fd = nltk.FreqDist(brown.words())  # not used below; kept from the original
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
# Iterate over the distinct words (the conditions) rather than over every
# token of the corpus; the resulting mapping is the same.
likely_tags = {word: cfd[word].max() for word in cfd.conditions()}
baseline_tagger = nltk.UnigramTagger(model=likely_tags,backoff=nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)

# Percentage share of each tag among the tokens that follow a noun.
# The total is loop-invariant, so compute it once instead of per iteration.
total_following = sum(count_tags.values())
for (k,v) in count_tags.items():
    print(k,":",(v/total_following)*100)

# Share of word types (lower-cased) that occur with more than one tag.
cfd = nltk.ConditionalFreqDist(
(w.lower(),tag)for (w,tag) in words)
count_dic = {}
for word in cfd.conditions():
    if len(cfd[word])>1:
        count_dic[word] = len(cfd[word])
# count_dic holds word *types*, so the denominator must be the number of
# distinct words, not len(words), which is the number of tokens.
print((len(count_dic)/len(cfd.conditions()))*100,"%")

# Alphabetically sorted list of the distinct words tagged MD in Brown.
words = brown.tagged_words()
# De-duplicate first and sort afterwards: wrapping an already sorted list in
# set() threw the ordering away again.
w_li = sorted({w.lower() for (w, t) in words if t == 'MD'})
print(w_li)

# Print three-word phrases of the form preposition + article + noun
# (e.g. "in the lab").
# NOTE(review): assumes `words` carries Brown tags, as assigned by
# brown.tagged_words() without a tagset argument; there the pattern is
# IN + AT + NN.  The tags 'P' and 'DET' do not exist in that tagset, so the
# original condition never matched.
for (prep, prep_tag), (det, det_tag), (noun, noun_tag) in nltk.trigrams(words):
    if prep_tag == 'IN' and det_tag == 'AT' and noun_tag == 'NN':
        # Print the whole phrase, not just its first (word, tag) pair.
        print(prep, det, noun)

# Collect the (lower-cased) words that directly precede a form of
# adore / love / like / prefer.
# Adjacent pairs avoid words[i - 1] at i == 0, which silently wraps around to
# the last token of the corpus.
ws = [
    prev_word.lower()
    for (prev_word, _), (cur_word, _) in nltk.bigrams(words)
    if cur_word.lower() in ('adore','love','like','prefer')
]
set(ws)

# Compare unigram, bigram and trigram taggers (no backoff) on a 70/30 split
# of the Brown corpus.
brown_tagged_sents = brown.tagged_sents()
brown_sents = brown.sents()

train_full_size = int(len(brown_tagged_sents) * 0.7)
train_sents, test_sents = (
    brown_tagged_sents[:train_full_size],
    brown_tagged_sents[train_full_size:],
)

# Unigram
tagger = nltk.UnigramTagger(train=train_sents)
tagger.evaluate(test_sents)

# Bigram
tagger = nltk.BigramTagger(train=train_sents)
tagger.evaluate(test_sents)

# Trigram
tagger = nltk.TrigramTagger(train=train_sents)
tagger.evaluate(test_sents)

结论：随着 n 增大（unigram → bigram → trigram），没有回退标注器的 n 元标注器在测试集上的性能逐渐下降。

25
加载人民日报2014语料

# Read the corpus file into a list of lines (line endings are kept).
with open(r'E:\laptop\研一\2014_corpus.txt', encoding='utf8') as f:
    corpus = list(f)

# Split the People's Daily corpus into sentences of (word, tag) pairs.
# Only the first 19999 lines are used (same limit as the original counter).
tagged_sents = []
for sent in corpus[:19999]:
    tagged_sent = []
    # Drop the line ending before splitting; otherwise the last token of a
    # line that does not end with a space keeps '\n' inside its tag.
    for w in sent.rstrip('\n').split(' '):
        parts = w.split('/')
        # Keep only well-formed "word/tag" tokens; empty tokens and tokens
        # with more than one '/' are skipped.
        if len(parts) == 2:
            tagged_sent.append(tuple(parts))
    tagged_sents.append(tagged_sent)

# Train a backoff chain on a 70/30 split of the Chinese corpus:
# bigram -> unigram -> default tag 'n'.
size = int(len(tagged_sents) * 0.7)
train, test = tagged_sents[:size], tagged_sents[size:]
t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train, backoff=t0)
t2 = nltk.BigramTagger(train, backoff=t1)
t2.evaluate(test)

# Segment a sample passage with jieba and tag it with the bigram tagger.
pfr_passage = 'PFR语料库是对人民日报1998年上半年的纯文本语料进行了词语切分和词性标注制作而成的，严格按照人民日报的日期、版序、文章顺序编排的。文章中的每个词语都带有词性标记。'
t2.tag(jieba.lcut(pfr_passage))

# Rebind t1 to a unigram tagger without backoff.
t1 = nltk.UnigramTagger(train)

%matplotlib inline
def perform(data,test):
    """Train a unigram tagger on `data` and return its score on `test`.

    Words missing from the trained model fall back to the default tag 'n'.
    """
    fallback = nltk.DefaultTagger('n')
    tagger = nltk.UnigramTagger(train=data, backoff=fallback)
    return tagger.evaluate(test)
def display():
    """Plot unigram-tagger accuracy against the amount of training data.

    The last 5000 sentences of `tagged_sents` are the test set.  Training
    slices grow in steps of 1000 sentences (x axis: 1..15, in thousands) and
    are drawn only from the sentences before the test set, so test sentences
    never leak into the training data.
    """
    test_size = 5000
    sizes = range(1,16)
    test = tagged_sents[-test_size:]
    # Held-out training pool; the original sliced `tagged_sents` directly, so
    # the largest slice could reach into the test set.
    train_pool = tagged_sents[:-test_size]
    perfs = [perform(train_pool[:size*1000],test) for size in sizes]
    plt.plot(sizes,perfs,'-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

# Build a confusion matrix of gold tags vs. the tags predicted by t2 on the
# test sentences.
# Untagged copy of every test sentence (words only).
orl_sent = [[word for (word, tag) in sent] for sent in test]

# Predicted and gold tags, flattened in the same sentence/word order so that
# the two lists line up element by element.
test_tags = [tag for sent in orl_sent for (word, tag) in t2.tag(sent)]
# The outer loop must come first: in the original the clauses were reversed,
# so `sent` was read from the enclosing scope instead of iterating `test`.
gold_tags = [tag for sent in test for (word, tag) in sent]
nltk.ConfusionMatrix(gold_tags,test_tags)

%matplotlib inline
def perform(data,test):
    """Return the score on `test` of a unigram tagger trained on `data`.

    The tagger backs off to the default tag 'n' for words it has no entry
    for.
    """
    default_tagger = nltk.DefaultTagger('n')
    unigram = nltk.UnigramTagger(train=data, backoff=default_tagger)
    score = unigram.evaluate(test)
    return score
def display():
    """Plot unigram-tagger accuracy against training size on a log x axis.

    The last 5000 sentences of `tagged_sents` are the test set.  Training
    slices grow in steps of 1000 sentences (x axis: 1..15, in thousands) and
    are drawn only from the sentences before the test set, so test sentences
    never leak into the training data.
    """
    test_size = 5000
    sizes = range(1,16)
    test = tagged_sents[-test_size:]
    # Held-out training pool; the original sliced `tagged_sents` directly, so
    # the largest slice could reach into the test set.
    train_pool = tagged_sents[:-test_size]
    perfs = [perform(train_pool[:size*1000],test) for size in sizes]
    plt.semilogx(sizes,perfs,'-bo')
    plt.xlabel('data size')
    plt.ylabel('perform')
    plt.show()
display()

# Rebuild the 70/30 split and the backoff chain
# (bigram -> unigram -> default tag 'n').
size = int(len(tagged_sents) * 0.7)
train = tagged_sents[:size]
test = tagged_sents[size:]

t0 = nltk.DefaultTagger('n')
t1 = nltk.UnigramTagger(train=train, backoff=t0)
t2 = nltk.BigramTagger(train=train, backoff=t1)
t2.evaluate(test)

from nltk.tag import brill

# Train a Brill (transformation-based) tagger on top of the bigram backoff
# tagger t2.  BrillTaggerTrainer.train() is an instance method: the trainer
# has to be constructed with an initial tagger and a set of rule templates
# first; calling it on the class fails.
brill_trainer = nltk.BrillTaggerTrainer(
    initial_tagger=t2,
    templates=brill.brill24(),
)
t3 = brill_trainer.train(train, max_rules=200, min_score=2, min_acc=None)
t3.evaluate(test)

标签：练习题,brown,tagged,Python,tag,words,sents,自然语言,nltk
来源： https://blog.csdn.net/qq_44715621/article/details/115139313

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

《Python自然语言处理》第五章练习题答案