【shikaobang】 python爬虫脚本

2022-02-08 23:02:32 阅读：179 来源： 互联网

标签：name python import 爬虫 content url html news shikaobang

"""
事考帮更新url加密数字后，无法解码。只能用【<div class="title">相关推荐</div>】里面的链接来处理
解决办法：相关推荐是按题目顺序排列，以最后一个为起始网址，不断循环复制加密编码，起到原来的效果
"""
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs
import re


a1 = 101500 #需要自己修改起始值

urlname_list = []
url_name_start = u'/questionbank/5YmJvWgYm6' #填入查询到开始的urlname
url_name_end = u'/questionbank/G5mbgoM1aX' #填入查询到最后的urlname
urlname_list.append(url_name_start)
a = 1
b = 1
while True:
    url_name = "http://www.shikaobang.cn" + url_name_start
    user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
    request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
    html = urllib2.urlopen(request)
    html_data = BeautifulSoup(html,"html.parser")
    if html_data.find(name='a') is None:
        urlname_list.pop()
        url_name_start = urlname_list[-1]
        print "网页抓取失败，此时网址为：" + url_name_start
        continue

    for m in html_data.find_all(href=re.compile("/questionbank/")) :
        if m['href'] == url_name_end:
            urlname_list.append(m['href'])
            break
        else:
            urlname_list.append(m['href'])
            a = a + 1
    url_name_start = urlname_list[-1]
    if url_name_end == url_name_start:
        break
    print u"网页抓取成功，此时网址为：" + url_name_start
    print u"查询结果共" + str(a) + u"条"
print u"最终查询结果共" + str(a) + u"条"


print u'开始爬取网页'
#爬取网页
import pandas as pd
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs

import time
time_start=time.time()
"""
修改题目对应网页数值

"""
a2 = a1

for i in urlname_list:
    try:
        url_name = "http://www.shikaobang.cn" + i
        user_agent = "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/2011122 Ubuntu/10.10 (maverick) Firefox/2.5.1"
        request = urllib2.Request(url_name, headers={'User-Agent':user_agent})
        html = urllib2.urlopen(request)
        f = codecs.open('html/sz_'+str(a1),'w')
        f.write(html.read())
        f.close()
        a1 = a1 + 1
    except:
        print i
        pass
    continue
print "下次使用该编码作为起始值:" + str((int(a1/100)+1)*100)
print "爬取网页结束，开始处理文本" 


# -*- coding: utf-8 -*-
def html_chuli(html):
    
    html_data = BeautifulSoup(html)
    
    t_miaosu = html_data.find(attrs={'name':'description'})['content'] #题目描述
    
    t_news_title = html_data.find_all(attrs={'class':'news-content-title'})
    t_news_typs = html_data.find_all(attrs={'class':'news-typs'})
    t_news_time = html_data.find_all(attrs={'class':'news-time'})

    tdata1 = html_data.find("div", attrs={'class':'main-content'})#抓取第一个框架
    if tdata1:
        t_leixing = tdata1.select('span')[0].string #题目类型
        t_content = tdata1.select('div.question-title')[0].string #题目内容 注：id是#；name是.
        t_xueze = tdata1.select('div.question-item') #题目所有选项

        x_ABCD = [] #选项ABCD
        x_content = [] #选项ABCD对应内容
        z_xueze = [] #正确选项

        for item in t_xueze:
            item_middle = item.get_text().split()
            x_ABCD.append(item_middle[:1]) 
            x_content.append(item_middle[1:]) 
    
        for item in tdata1.select('label.actives'):#选择
            z_xueze.append(item.string)   
        for item in tdata1.select('div.question-item.correct i'):#判断
            z_xueze.append(item.string)            
    
        return t_miaosu,t_leixing,t_content,x_ABCD,x_content,z_xueze,t_news_title,t_news_typs,t_news_time
    else:
        return '0'

#文本处理
import pandas as pd
import urllib
import urllib2
import re
import json
import random
from bs4 import BeautifulSoup
import codecs

"""
修改提取后对应文本编码
"""
for i in range(a2,a1):
    try:
        with open('html/sz_'+str(i), 'r') as f:
            s_1 = ""
            s_2 = ""
            t_n = ""
    
            contents = f.read().decode("utf-8", "ignore") #处理�
            t_miaosu,t_leixing,t_content,x_ABCD,x_content,z_xueze,t_news_title,t_news_typs,t_news_time = html_chuli(contents)

            for m in range(len(x_ABCD)):
                if x_ABCD[m][0]:
                    s1 = x_ABCD[m][0]
                else:
                    s1=""
                if x_content[m][0]:
                    s2 = x_content[m][0]
                else:
                    s2=""  
                
                s_1 = s_1 + s1 + ":" + s2 + "  "

            for n in range(len(z_xueze)):
                s_2 = s_2 + z_xueze[n].strip()
    
            for z in range(len(t_news_title)):
                if t_news_title[z]:
                    new1 = t_news_title[z].text
                else:
                    new1=""
                if t_news_typs[z]:
                    new2 = t_news_typs[z].text
                else:
                    new2=""
                if t_news_time[z]:
                    new3 = t_news_time[z].text
                else:
                    new3=""
                
                t_n = t_n + new1 + "|" + new2 + "|" + new3 + "&"
        
            if t_leixing is None:
                continue
                
            k1 = str(i) + "#" + t_miaosu.replace("\n", "") + "#" + t_leixing + "#" + t_content.replace(" ", "").replace("\n", "") + "#" + s_1.replace("\n", "") + "#" + s_2.replace("\n", "") + "#" + t_n.replace("\n", "")
            f1 = codecs.open(u'out/时政202011-20210325.txt','a',encoding="utf-8") #修改导出txt文件编号
            f1.write(k1 + "\n")
    except:
        f2 = codecs.open('out/fail_num.txt','a',encoding="utf-8")
        k2 = str(i)
        f2.write(k2 + "\n")
        print str(i) + u"号html文件导入失败！"
        f2.close()
        pass
    continue
               
f1.close()

print u"处理完毕！再次执行请修改“输出文件名”，并保存py文件，然后重新开始！！！"

　　此代码仅纪念作用，目前已不可用

标签：name,python,import,爬虫,content,url,html,news,shikaobang
来源： https://www.cnblogs.com/CQ-LQJ/p/15873219.html

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

【shikaobang】 python爬虫脚本