标签:get,url,self,html,爬虫,爬取,book,data,笔趣
# -*- coding: utf-8 -*-
import requests
from lxml import etree


class BookSpider(object):
    """Crawl the novel site chapter index, download every chapter body,
    and append each chapter (title + text) to a local text file."""

    def __init__(self):
        # Index page listing all chapters.
        self.url = "http://www.jianlaixiaoshuo.com/"
        # Prefix joined with each chapter's relative link.
        self.base_url = "http://www.jianlaixiaoshuo.com/"
        # Fix: the header key was misspelled "Use_Agent", so the intended
        # User-Agent was never actually sent with requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}

    # Fetch a page and return its decoded HTML text.
    def get_html(self, url):
        html = requests.get(url, headers=self.headers).content.decode()
        return html

    # Evaluate an XPath expression against an HTML string; returns a list.
    def get_xpath(self, html, pattern):
        p = etree.HTML(html)
        result = p.xpath(pattern)
        return result

    # Append a chunk of text to the output file.
    def save_data(self, data):
        with open('剑来.txt', 'a', encoding='utf-8') as f:
            f.write(data)

    # Download every chapter linked from the index page at `url`.
    def down_load(self, url):
        # Fix: previously ignored the `url` argument and always fetched
        # self.url, making the parameter meaningless.
        html = self.get_html(url)
        pattern1 = '//dl[@class="chapterlist"]/dd/a/@href'
        pattern2 = '//dl[@class="chapterlist"]/dd/a/text()'
        # Relative link of each chapter.
        book_lists = self.get_xpath(html, pattern1)
        # Title of each chapter.
        book_name_lists = self.get_xpath(html, pattern2)
        for book_name, link in zip(book_name_lists, book_lists):
            # Absolute URL of this chapter.
            book_url = self.base_url + link
            book_html = self.get_html(book_url)
            # Extract the chapter body paragraphs.
            pattern = '//div[@id="BookText"]/p/text()'
            book_data = ''.join(self.get_xpath(book_html, pattern))
            # XPath text() nodes never contain markup, so the former
            # <p>/<br>/<script> replace() calls were dead code; only the
            # em-dash decoration needs stripping.
            book_data = book_data.replace('—', '') + '\n'
            book_text = book_name + '\n' + book_data
            print('正在下载', book_name)
            print(book_text)
            self.save_data(book_text)

    # Entry point: crawl starting from the chapter index page.
    def run(self):
        self.down_load(self.url)


if __name__ == "__main__":
    p = BookSpider()
    p.run()
标签:get,url,self,html,爬虫,爬取,book,data,笔趣 来源: https://www.cnblogs.com/maxxu11/p/12631126.html
本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享; 2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关; 3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关; 4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除; 5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。