标签:fp cont python text json pn 实验报告 data
选择一个股票代码(尾数与学号相同),编写爬虫程序,分析百度查询结果变动的情况。程序与分析结果写在实验结果栏。
import bs4
import re
import requests
import json
# HTTP request headers sent with every Baidu query.
# FIX: the original key 'Accept - Encoding' (with embedded spaces) is not a
# valid HTTP header name and was silently ignored by the server.
# NOTE(review): 'Host' is pinned to zhannei.baidu.com while baiduText queries
# www.baidu.com — a mismatched Host header; kept as-is to preserve behavior,
# but consider removing it so requests derives Host from the URL.
headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'Host': 'zhannei.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
}
# Fetch one page of Baidu search results with requests.
def baiduText(code, pn):
    """Return the HTML of Baidu result page *pn* (1-based) for query *code*.

    Baidu's ``pn`` URL parameter counts results (10 per page), hence the
    ``(pn - 1) * 10`` conversion. Returns the literal string ``"error"`` on
    any request failure — legacy sentinel kept so existing callers still work.
    """
    url = 'http://www.baidu.com/s?wd=' + code + '&pn=' + str((pn - 1) * 10)
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = 'utf-8'  # force utf-8 decoding (original note: linux utf-8)
        return r.text
    # FIX: was a bare `except:`, which also swallowed NameError,
    # KeyboardInterrupt, etc. Catch only request-level failures.
    except requests.RequestException:
        return "error"
# Extract the total-result-count banner from a Baidu result page with bs4.
def parseSearch(text):
    """Find spans like '百度为您找到相关结果约1,234个', log and print the count.

    Writes each count found to txt.txt via writefile().
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    prefix = '百度为您找到相关结果约'
    for tag in soup.find_all('span'):
        cont = tag.string
        if cont is not None and cont.startswith(prefix):
            # FIX: the original used cont.lstrip(prefix); str.lstrip treats its
            # argument as a character SET and can over-strip. Slice off the
            # exact prefix instead (startswith was already verified above).
            cont = cont[len(prefix):]
            cont = cont.rstrip('个').replace(',', '')
            writefile('搜索结果为:' + str(cont))
            print('搜索结果为:', cont)
def parseHtml(text, json_data):
    """Extract (title, href) pairs from organic results on a Baidu page.

    Each pair is appended to txt.txt via writefile() and stored in
    *json_data* (mutated in place) keyed by the result title.
    """
    soup = bs4.BeautifulSoup(text, "html.parser")
    rule = re.compile(r'ref="(.*?)"')  # hoisted: compile once, not per tag
    for tag in soup.find_all('div', class_='result c-container '):
        anchor = tag.a
        # FIX: the original tested str(tag.a) against None, but str(None) is
        # the string 'None', so the guard never fired and tag.a.text crashed.
        if anchor is None:
            continue
        name = str(anchor.text)
        match = rule.search(str(anchor))
        # FIX: search() returns None when no href matches; the original
        # called .group() unconditionally and raised AttributeError.
        if match is None:
            continue
        # FIX: use the capture group instead of lstrip('ref="')/rstrip('"'),
        # which strip character SETS and could eat leading URL characters.
        data = match.group(1)
        writefile(name + '\n' + data)
        json_data[name] = data
# Append one line of text to the report file txt.txt.
def writefile(data, param=True):
    """Append *data* plus a newline to txt.txt.

    *param* is kept only for backward compatibility with existing callers
    (writefile(..., False) on the last write): the ``with`` statement already
    closes the file on every call, so the original explicit ``fp.close()``
    for param == False was redundant and has been removed.
    """
    with open('txt.txt', 'a+', encoding="utf-8") as fp:
        fp.write(data)
        fp.write('\n')
# Persist the collected title->url mapping as pretty-printed JSON.
def writejson(json_data):
    """Append *json_data* to txt.json as indented, non-ASCII-preserving JSON."""
    with open('txt.json', 'a+', encoding="utf-8") as out:
        json.dump(json_data, out, indent=4, ensure_ascii=False)
# Entry point: crawl Baidu result pages for one stock code.
def main(code='300014', pages=99):
    """Crawl *pages* Baidu result pages for *code* and save results.

    Generalized (backward-compatibly) from the original hard-coded stock
    code '300014' and fixed 99-page loop; defaults reproduce the original
    behavior exactly. Page 1 additionally logs the total result count.
    Writes per-page results to txt.txt and the title->url map to txt.json.
    """
    json_data = {}
    for pn in range(1, pages + 1):
        text = baiduText(code, pn)
        if pn == 1:
            parseSearch(text)
        writefile("第" + str(pn) + "页的搜索结果为:")
        parseHtml(text, json_data)
    print("爬取结束,请看txt.txt")
    writefile("爬取结束", False)
    writejson(json_data)


# FIX: guard the script entry point so importing this module does not
# immediately launch a 99-page crawl.
if __name__ == "__main__":
    main()
import json
def compare():
    """Diff two saved crawl snapshots (txt1.json vs txt2.json).

    Collects entries present in exactly one of the two snapshots (the
    symmetric difference, keeping each key's value from whichever side has
    it) and appends the result to txt.json as indented JSON, then prints
    "over".
    """
    with open('txt1.json', 'r', encoding="utf-8") as fp:
        data1 = json.load(fp)
    with open('txt2.json', 'r', encoding="utf-8") as fp:
        data2 = json.load(fp)
    # FIX: renamed from `dict`, which shadowed the builtin.
    diff = {}
    for key in data1:
        if key not in data2:
            diff[key] = data1[key]
    for key in data2:
        if key not in data1:
            diff[key] = data2[key]
    # FIX: dropped the redundant fp.close() — `with` already closes the file.
    with open('txt.json', 'a+', encoding="utf-8") as fp:
        json.dump(diff, fp, indent=4, ensure_ascii=False)
    print("over")


# FIX: guard the entry point so importing this module does not run compare().
if __name__ == "__main__":
    compare()
标签:fp,cont,python,text,json,pn,实验报告,data 来源: https://blog.csdn.net/qq_39383017/article/details/96426995
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。