Background: the Guazi used-car site (guazi.com) has put some anti-scraping measures in place. How do we get the data we want in spite of them?
Prerequisites: the project code uses several urllib helpers to split and rebuild URLs. If you are not familiar with them, it is worth reading this first: https://blog.csdn.net/liyuanjinglyj/article/details/118697545
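For reference, here is a minimal sketch of the split-update-rebuild pattern those helpers enable (the example URL is made up; the spider below applies the same steps to the real list-page URL):

from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse, unquote

# A made-up URL, just to show the round trip
url = "https://example.com/list?page=1&city=12&tag="
parts = list(urlparse(url))                                # six components: scheme, netloc, path, params, query, fragment
query = dict(parse_qsl(parts[4], keep_blank_values=True))  # keep empty fields such as "tag="
query.update({"page": "2"})                                # bump the page number
parts[4] = urlencode(query)                                # re-encode the query string
print(unquote(urlunparse(parts)))                          # https://example.com/list?page=2&city=12&tag=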
request_brands.py
Scrapes the car brands from the Guazi used-car platform and saves them to a local brands.txt file; they are later fed into the Scrapy project as parameters.
Note: the client-time and verify-token values in the headers must be taken from the suggestion async (XHR) request you observe after opening the used-car site yourself; the ones below will not work unchanged.
import requests

url = "https://mapi.guazi.com/car-source/carList/suggestion?osv=Unknown&city=65&field=1&platfromSource=wap&versionId=0.0.0.0&sourceFrom=wap&deviceId=2e464fb5-c575-4359-b5bc-001d2afe6a8a"
# client-time and verify-token must be captured from your own browser session
headers = {
    "client-time": "1656493058",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.62 Safari/537.36",
    "verify-token": "fafb20ed92e08be5adb117794e90bd03"
}
response = requests.get(url, headers=headers)
# Save the response body to a local file
with open("brands.txt", "w", encoding="utf-8") as f:
    f.write(response.text)
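The spider later parses brands.txt as JSON and walks data.common, so, assuming the endpoint still responds in that shape, a quick sanity check of the saved file looks like this (field names taken from the spider code below):

import json

with open("brands.txt", "r", encoding="utf-8") as f:
    brands = json.load(f)

# Each entry is expected to carry a "value" that gets filled into the list-page URL
for brand in brands.get("data", {}).get("common", []):
    print(brand.get("value"))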
The Scrapy project
items.py
First, pin down the data we want to scrape: the car source ID, car name, engine displacement, transmission, and price. The car source ID is a parameter in the URL of a car's detail page.
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapyGuaziDemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # car source ID
    car_id = scrapy.Field()
    # car name
    car_name = scrapy.Field()
    # engine displacement
    displacement = scrapy.Field()
    # transmission
    transmission = scrapy.Field()
    # price
    price = scrapy.Field()
guazi.py
start_requests: reads the car brands from brands.txt and fills each one into the list-page URL.
parse: builds a detail-page URL from each car's source ID and hands it to the parse_detail callback; it also paginates through the list, calling back into itself.
parse_detail: extracts the car's information from the detail page and assigns it to the corresponding Item fields.
import scrapy
import json
from ..items import ScrapyGuaziDemoItem
from urllib.parse import urlparse, parse_qsl, urlunparse, unquote, urlencode


class GuaziSpider(scrapy.Spider):
    name = 'guazi'
    allowed_domains = ['guazi.com']
    start_urls = ['http://guazi.com/']

    def start_requests(self):
        """Send the list-page requests"""
        with open("../brands.txt", "r", encoding="utf-8") as f:
            brands_data = f.read()
        # Get the brand list
        brands_list = json.loads(brands_data).get("data").get("common")
        for brand in brands_list:
            # URL of the first list page
            url = "https://mapi.guazi.com/car-source/carList/pcList?osv=Unknown&minor={}&sourceType=&ec_buy_car_list_ab=&location_city=&district_id=&tag=-1&license_date=&auto_type=&driving_type=&gearbox=&road_haul=&air_displacement=&emission=&car_color=&guobie=&bright_spot_config=&seat=&fuel_type=&order=7&priceRange=0,-1&tag_types=&diff_city=&intention_options=&initialPriceRange=&monthlyPriceRange=&transfer_num=&car_year=&carid_qigangshu=&carid_jinqixingshi=&cheliangjibie=&page=1&pageSize=20&city_filter=12&city=12&guazi_city=12&qpres=544352372349644800&platfromSource=wap&versionId=0.0.0.0&sourceFrom=wap&deviceId=c5f33bc0-08a6-438e-ae41-dcb6d88c2d2d".format(brand.get("value"))
            yield scrapy.Request(url=url, callback=self.parse)
            # Only request the first page of a single brand, to keep the demo small
            break

    def parse(self, response):
        """Handle the response of a list-page request"""
        data = response.json().get("data")
        # Each used-car entry in the list
        guazi_items = data.get("postList")
        for item in guazi_items:
            detail_url = "https://www.guazi.com/Detail?clueId={}".format(item.get("clue_id"))
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)
            # break
        # Current page number
        now_page = data.get("page")
        # Total number of pages
        total_page = data.get("totalPage")
        # Check whether there is more than one page
        if total_page > 1 and now_page == 1:
            # Issue requests starting from page 2
            for page in range(2, total_page + 1):
                # URL of the first page
                url = response.url
                # Query parameter for the next page
                params = {"page": str(page)}
                # urlparse splits the URL into six components
                url_parts = list(urlparse(url))
                # parse_qsl turns the query string into pairs; keep_blank_values=True keeps empty fields
                query = dict(parse_qsl(url_parts[4], keep_blank_values=True))
                # Update the page number
                query.update(params)
                # Re-encode the updated query string and put it back at index 4 of the parsed URL
                url_parts[4] = urlencode(query)
                # Unquote the reassembled URL
                page_url = unquote(urlunparse(url_parts))
                yield scrapy.Request(url=page_url, callback=self.parse)

    def parse_detail(self, response):
        guazi_info = ScrapyGuaziDemoItem()
        # Car source ID
        guazi_info['car_id'] = response.xpath("//div[@class='right-carnumber']/text()").extract_first().strip()
        # Car name
        guazi_info['car_name'] = response.xpath("//h1[@class='titlebox']/text()").extract_first().strip()
        # Engine displacement
        guazi_info['displacement'] = response.xpath("//span[@class='assort-common']/text()").extract_first()
        # Transmission
        guazi_info['transmission'] = response.xpath("//li[@class='assort-last']/span/text()").extract_first()
        # Price
        guazi_info['price'] = response.xpath("//span[@class='price-num gzfont']/text()").extract_first()
        yield guazi_info
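Since the detail-page markup changes from time to time, it can be worth sanity-checking the XPath expressions in scrapy shell before a full run (the clueId below is a placeholder, not a real one):

scrapy shell "https://www.guazi.com/Detail?clueId=<clue-id>"
>>> response.xpath("//h1[@class='titlebox']/text()").extract_first()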
middlewares.py
Guazi has added an anti-scraping feature: car prices are rendered with a custom character encoding. We therefore add a HandleDetail class to counter it, decoding those digits in the returned Response before it is passed on to the Scrapy engine. The obfuscated characters change over time, so adjust the mapping against the live page source.
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
import base64
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class ScrapyGuaziDemoSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ScrapyGuaziDemoDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class HandleDetail(object):
    def process_response(self, request, response, spider):
        # Map each digit to the obfuscated character the site currently uses;
        # the glyphs change over time, so fill them in from the live page source
        replace_value = {
            "0": "",
            "1": "",
            "2": "",
            "3": "",
            "4": "",
            "5": "",
            "6": "",
            "7": "",
            "8": "",
            "9": ""
        }
        # Get the response body as text
        text = response.text
        for digit, glyph in replace_value.items():
            # Guard against empty placeholders, which would match everywhere
            if glyph and glyph in text:
                text = text.replace(glyph, digit)
        # Swap the decoded text back into the response body
        response = response.replace(body=text)
        # Hand the decoded response on to the engine
        return response
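To see what HandleDetail is doing, here is the substitution in isolation; the glyphs below are invented stand-ins, since the site's real ones were stripped above and change over time anyway:

# Invented glyph mapping, for illustration only
replace_value = {"1": "斗", "2": "与", "8": "聿"}

text = "<span class='price-num gzfont'>斗与.聿</span>"  # pretend response body
for digit, glyph in replace_value.items():
    text = text.replace(glyph, digit)
print(text)  # <span class='price-num gzfont'>12.8</span>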
pipelines.py
The parse_detail method in guazi.py hands data to the Item; pipelines.py writes the Item data to the database. We add a MongoPipeline class with the insertion code.
The database username, password, host, port, and so on live in settings.py and are read with crawler.settings.get().
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo


class ScrapyGuaziDemoPipeline:
    def process_item(self, item, spider):
        return item


class MongoPipeline:
    # Name of the collection (i.e. the table) items are saved to
    collection_name = 'scrapy_items'

    # MongoDB connection parameters
    def __init__(self, mongo_uri, mongo_db, mongo_port, username, password):
        self.mongo_uri = mongo_uri    # e.g. 127.0.0.1
        self.mongo_db = mongo_db      # e.g. guazi
        self.mongo_port = mongo_port  # port number
        self.username = username
        self.password = password

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),               # Mongo host from settings.py
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items'),  # database name
            mongo_port=crawler.settings.get('MONGO_PORT'),             # port
            username=crawler.settings.get('USER_NAME'),
            password=crawler.settings.get('USER_PASSWORD')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.mongo_uri, port=self.mongo_port,
                                          username=self.username, password=self.password)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    # Store the item
    def process_item(self, item, spider):
        self.db[self.collection_name].insert_one(ItemAdapter(item).asdict())
        return item
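Once the spider has run, the stored items can be spot-checked directly with pymongo; this is a sketch, assuming the default connection values from settings.py below:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017,
                             username="admin", password="123456")
db = client["guazi"]
# Print one stored item from the scrapy_items collection
print(db["scrapy_items"].find_one())
client.close()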
settings.py
DOWNLOADER_MIDDLEWARES enables the HandleDetail class added in middlewares.py, so responses are decoded before reaching the engine; ITEM_PIPELINES enables the MongoPipeline class added in pipelines.py, opening the pipeline that writes Item data to the database. The connection parameters are set below.
ITEM_PIPELINES = {
    # 'scrapy_guazi_demo.pipelines.ScrapyGuaziDemoPipeline': 300,
    'scrapy_guazi_demo.pipelines.MongoPipeline': 300,
}


DOWNLOADER_MIDDLEWARES = {
    # 'scrapy_guazi_demo.middlewares.ScrapyGuaziDemoSpiderMiddleware': 543,
    'scrapy_guazi_demo.middlewares.HandleDetail': 543,
}


MONGO_URI = '127.0.0.1'
# MONGO_PORT = 1112
MONGO_PORT = 27017  # key must match the name read in MongoPipeline.from_crawler
MONGO_DATABASE = 'guazi'
USER_NAME = 'admin'
USER_PASSWORD = '123456'
main.py
The launcher script.
from scrapy import cmdline

cmdline.execute("scrapy crawl guazi".split())
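If you prefer not to go through cmdline, the same crawl can be started in-process with Scrapy's CrawlerProcess; the two are equivalent here, cmdline.execute is simply the shorter option for a single spider:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py so the pipeline and middleware are applied
process = CrawlerProcess(get_project_settings())
process.crawl("guazi")
process.start()  # blocks until the crawl finishes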
Source: https://www.cnblogs.com/gltou/p/16423938.html