ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享 > 文章详细

爬虫-瓜子二手车

2019-09-24 19:42:21  阅读:359  来源: 互联网

标签:瓜子 二手车 car list 爬虫


import requests,re,json,time,random
from lxml import etree
from fake_useragent import UserAgent
# Shared fake_useragent generator. Nothing else in the visible code references
# ``ua``; request_html sends a fixed User-Agent string instead.
ua = UserAgent()

def request_html(url, timeout=10):
    """Download ``url`` with browser-like headers and return the body as text.

    Args:
        url: Absolute URL of the guazi.com page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'br' is deliberately not advertised: requests only decodes Brotli
        # when an optional brotli package is installed, and otherwise
        # ``.content`` would be compressed bytes that fail to decode as UTF-8.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Hard-coded session cookie sent with every request.
        'Cookie': 'uuid=f50f4be3-0e11-4e0d-d7e2-ad282bb42715; clueSourceCode=10103000312%2300; ganji_uuid=9459142464668822758037; sessionid=4a3ab0f9-0dba-4152-a135-1216a0b42892; lg=1; _gl_tracker=%7B%22ca_source%22%3A%22-%22%2C%22ca_name%22%3A%22-%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_id%22%3A%22-%22%2C%22ca_s%22%3A%22self%22%2C%22ca_n%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22sid%22%3A58880429608%7D; user_city_id=73; cityDomain=yancheng; antipas=UL3U7i501f7530734474D9817; cainfo=%7B%22ca_a%22%3A%22-%22%2C%22ca_b%22%3A%22-%22%2C%22ca_s%22%3A%22pz_baidu%22%2C%22ca_n%22%3A%22tbmkbturl%22%2C%22ca_medium%22%3A%22-%22%2C%22ca_term%22%3A%22-%22%2C%22ca_content%22%3A%22%22%2C%22ca_campaign%22%3A%22%22%2C%22ca_kw%22%3A%22-%22%2C%22ca_i%22%3A%22-%22%2C%22scode%22%3A%2210103000312%22%2C%22keyword%22%3A%22-%22%2C%22ca_keywordid%22%3A%22-%22%2C%22ca_transid%22%3A%22%22%2C%22platform%22%3A%221%22%2C%22version%22%3A1%2C%22display_finance_flag%22%3A%22-%22%2C%22client_ab%22%3A%22-%22%2C%22guid%22%3A%22f50f4be3-0e11-4e0d-d7e2-ad282bb42715%22%2C%22ca_city%22%3A%22zz%22%2C%22sessionid%22%3A%224a3ab0f9-0dba-4152-a135-1216a0b42892%22%7D; preTime=%7B%22last%22%3A1568901802%2C%22this%22%3A1568881044%2C%22pre%22%3A1568881044%7D',
        'Host': 'www.guazi.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode('utf-8')
def _first(values, default='None', suffix=''):
    """Return ``values[0] + suffix``, or ``default`` when ``values`` is empty."""
    return values[0] + suffix if values else default


def _parse_car(car):
    """Build the record dict for one ``<li>`` element of a car list page.

    Any field whose XPath matches nothing is stored as the string ``'None'``
    (the same placeholder already used for the optional prices), so a single
    incomplete list item cannot abort the crawl with an IndexError.
    """
    # Year, mileage and service come from the text nodes of the same
    # "t-i" div; pad to three entries in case fewer are present.
    info = car.xpath('.//div[@class="t-i"]/text()')
    info = info + ['None'] * (3 - len(info))
    return {
        # 1. image link
        'car_img': _first(car.xpath('.//img/@src')),
        # 2. title
        'car_title': _first(car.xpath('.//h2[@class="t"]/text()')),
        # 3. year, mileage, service
        'car_year': info[0],
        'car_km': info[1],
        'car_sever': info[2],
        # 4. current price
        'car_price': _first(car.xpath('.//div[@class="t-price"]/p/text()'), suffix='万'),
        # 5. original price
        'car_oprice': _first(car.xpath('.//div[@class="t-price"]/em/text()')),
        # 6. subsidy amount
        'car_bprice': _first(car.xpath('.//em[@class="icon-sale"]/span/text()'), suffix='元'),
        # 7. tags, comma-separated
        'car_tag': ','.join(car.xpath('.//div[@class="t-price"]/i/text()')),
        # 8. link to the detail page
        'car_detail_url': _first(car.xpath('./a/@href')),
    }


# First request: fetch one city's listing page to discover the city list.
url = 'https://www.guazi.com/yancheng/buy'
response = request_html(url)
# Every car record collected during the crawl (each record appears once).
lis1 = []
city = re.compile(r'"domain":"(.*?)"', re.S)
# De-duplicate while keeping first-seen order, in case the same domain is
# matched more than once, so that no city is crawled twice.
city_list = list(dict.fromkeys(city.findall(response)))
for i in city_list:
    # Listing page of this specific city.
    city_url = 'https://www.guazi.com/{}/buy'.format(i)
    response2 = request_html(city_url)
    tree2 = etree.HTML(response2)
    brand_list = tree2.xpath('.//div[@class="dd-all clearfix js-brand js-option-hid-info"]//a/@href')
    for j in brand_list:
        brand_url1 = 'https://www.guazi.com' + j
        # Drop a trailing '#bread' fragment. str.rstrip('#bread') would strip
        # any run of those *characters*, not the suffix, and could eat into
        # the path itself.
        brand_base = brand_url1
        if brand_base.endswith('#bread'):
            brand_base = brand_base[:-len('#bread')]
        # Only page 1 is fetched; widen the range to crawl more result pages
        # (the original note mentions 50 pages of data).
        for k in range(1, 2):
            # e.g. https://www.guazi.com/yancheng/benz/o1/
            brand_url = brand_base + 'o{}/'.format(k)

            response3 = request_html(brand_url)
            tree3 = etree.HTML(response3)
            # One <li> per car on the page.
            car_list = tree3.xpath('.//ul[@class="carlist clearfix js-top"]/li')
            # Records of this page only.
            lis2 = [_parse_car(car) for car in car_list]
            lis1.extend(lis2)
            if lis2:
                # Append only this page's new records, one JSON object per
                # line. Re-dumping the whole cumulative list after every page
                # produced concatenated arrays (not valid JSON) full of
                # duplicated records.
                with open('guazi1.json', 'a', encoding='utf-8') as f:
                    for dic in lis2:
                        f.write(json.dumps(dic, ensure_ascii=False) + '\n')

标签:瓜子,二手车,car,list,爬虫
来源: https://blog.csdn.net/weixin_42766128/article/details/101305476

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非营利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为广大技术爱好者提供技术内容和技术思想分享与交流的网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有