import requests
import re
import sqlite3

def get_page(url, timeout=10):
    """Download ``url`` and return its HTML source.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the status code is 200; otherwise the
        sentinel string '失败' ("failed"), kept so existing callers see the
        same value as before.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return '失败'

def parse_html(html):
    """Extract the ranking entries from the rank page's HTML source.

    Args:
        html: HTML source text to scan.

    Returns:
        A list of ``(rank, link, title, rating)`` string tuples, one per
        matched ``<li class="row mb-1 rank-item">`` entry, in document order.
        The list is empty when nothing matches.
    """
    # Raw strings: "\d" inside a normal string literal is an invalid escape
    # sequence. The pattern is split by captured field for readability; the
    # concatenated value is unchanged.
    pattern = re.compile(
        r'<li class="row mb-1 rank-item">.*?rank.*?>(\d+)</span>'   # rank
        r'.*?href="(.*?)"'                                          # link
        r'.*?age-text-blue small.*?>(.*?)</span>'                   # title
        r'.*?col-3 small text-truncate.*?>(.*?)</span>',            # rating
        re.S,  # let "." span newlines between the tags
    )
    return pattern.findall(html)

def link_html():
    """Fetch the rank page and return its parsed entries.

    Returns:
        A list of ``(rank, link, title, rating)`` tuples as produced by
        ``parse_html``, except that each ``link`` is resolved against the
        rank page URL so that site-relative hrefs become absolute URLs.
    """
    from urllib.parse import urljoin

    rank_url = 'https://m.agefans.org/rank'
    entries = parse_html(get_page(rank_url))
    # The page's <a href> values are relative paths; join them with the page
    # URL so the stored links can be opened directly.
    return [
        (rank, urljoin(rank_url, link), title, rating)
        for rank, link, title, rating in entries
    ]

def save_db():
    """Scrape the ranking and store it in the ``data`` table of the SQLite file.

    The table is rebuilt from scratch on every call: any existing ``data``
    table is dropped, recreated with the columns rank, link, title and
    rating, and filled with the rows returned by ``link_html``.
    """
    rows = link_html()
    con = sqlite3.connect(r'...\AGE.db')
    try:
        # Connection as context manager: commit on success, roll back on error.
        with con:
            # IF EXISTS so the very first run, when no table exists yet,
            # does not fail with "no such table".
            con.execute("DROP TABLE IF EXISTS data")
            con.execute("create table data (rank primary key,link,title,rating)")
            con.executemany(
                "insert into data(rank,link,title,rating) values(?,?,?,?)",
                rows,
            )
    finally:
        # Always release the database file, even if a statement failed.
        con.close()

# Run the scrape-and-store step only when this file is executed as a script.
if __name__ == '__main__':
    save_db()

1.1运行结果

用DB Browser for SQLite查看AGE.db爬取的内容（展示1~35条信息）

1.2爬虫难点

1.2.1编写正则表达式:

排名：.*?rank.*?>(\d+)</span>

链接：.*?href="(.*?)"

名称：.*?age-text-blue small.*?>(.*?)</span>

热度：.*?col-3 small text-truncate.*?>(.*?)</span>

每个网站的页面元素组成各不相同，需要根据实际要爬取的网站写出对应的正则表达式。在浏览器中按 CTRL+SHIFT+I 打开开发者工具，即可检查页面元素。

1.3爬虫中的不足

1.3.1抓取的动漫播放链接不够完整

由于网站 <a> 标签内的 href 是相对路径而不是绝对路径，爬取出来的链接 link 并不能直接跳转。当时没有找到将相对路径换成绝对路径的方法（可以使用标准库 urllib.parse.urljoin 将相对路径与页面地址拼接成绝对路径）。

2.GUI展现爬虫内容

import tkinter
import tkinter.messagebox
from tkinter.messagebox import *
import tkinter.ttk
import tkinter as tk
import sqlite3
from PIL import ImageTk, Image
from tkinter import ttk
import pymysql
# Create the root window that hosts every widget below.
win=tkinter.Tk()

# Window size and title
win.geometry("1390x750")
win.title('AGE排行榜')

# Heading label
label=tkinter.Label(win,compound = 'center',text='AGE动漫排行榜',font=('黑体',40),fg='#db7093',bg='#add8e6',width='500')
label.pack()

# Background image, drawn on a canvas
imgpath = (r'...\1.jpg')#path of the background image
img = Image.open(imgpath)
canvas = tk.Canvas(win, width=2500, height=1000, bd=0)
photo = ImageTk.PhotoImage(img)
canvas.create_image(690, 280, image=photo)
canvas.pack()

# The wildcard import provides the unqualified Label, StringVar and Entry
# used here (and Button, used further down).
from tkinter import *
# Keyword search: a caption label plus an entry bound to `selecttitle`.
Label(win, text="关键字查询：",bg='#add8e6',font=('黑体',15)).place(x=500, y=80, width=120, height=25)
selecttitle = StringVar()
Entry(win, textvariable=selecttitle).place(x=650, y=80, width=300, height=25)


# Database location
# NOTE(review): the leading '...' looks like a placeholder for the real
# directory — confirm and replace before running.
database = (r'...\AGE.db')

# Display function
def showAllInfo():
    """Reload the table widget with every row of the ``data`` table.

    Clears whatever ``dataTreeview`` currently shows, reads all rows from the
    SQLite file at ``database`` and appends them in query order.
    """
    # Remove the rows left over from the previous query.
    for item_id in dataTreeview.get_children():
        dataTreeview.delete(item_id)

    con = sqlite3.connect(database)
    try:
        rows = con.execute("select * from data").fetchall()
    finally:
        # Close the connection even if the query fails.
        con.close()

    for row in rows:
        # "end" always appends. A fixed index of 100 would insert every row
        # past the 100th at position 100, reversing their order.
        dataTreeview.insert("", "end", text="line1", values=row)

# Search by title
def showTitle():
    """Show the rows whose title contains the keyword typed by the user.

    Reads the keyword from ``selecttitle``. An empty keyword only produces an
    error dialog. Otherwise the table widget is cleared and the ``data``
    table is searched with a ``LIKE '%keyword%'`` match. With no hits an
    error dialog is shown; with hits an info dialog reports the number of
    rows, which are then listed in ``dataTreeview``.
    """
    keyword = selecttitle.get()
    if keyword == "":
        showerror(title='提示', message='输入不能为空')
        return

    # Remove the rows left over from the previous query.
    for item_id in dataTreeview.get_children():
        dataTreeview.delete(item_id)

    con = sqlite3.connect(database)
    try:
        # Bind the keyword as a parameter instead of concatenating it into
        # the SQL text: user input containing a quote can then neither break
        # the statement nor inject SQL.
        rows = con.execute(
            "select * from data where title like ?",
            ("%" + keyword + "%",),
        ).fetchall()
    finally:
        # Close the connection even if the query fails.
        con.close()

    if not rows:
        # Nothing matched: tell the user instead of showing an empty table.
        showerror(title='提示', message='此动漫暂未上榜，或检查输入信息是否正确')
        return

    showinfo(title='提示', message='查询到'+str(len(rows))+"条数据")
    for row in rows:
        # "end" always appends; a fixed index of 100 would misorder rows
        # beyond the 100th.
        dataTreeview.insert("", "end", text="line1", values=row)

# Buttons that trigger the two query callbacks defined above.
tkinter.Button(win,text='查询全部',width=40,command=showAllInfo,font=(12)).place(x=800, y=125, width=120, height=30)
Button(win, text="按标题查询", command=showTitle,font=(12)).place(x=550, y=125, width=120, height=30)

# Table widget listing the SQLite rows; its column ids are
# rank, link, title and rating.
dataTreeview = ttk.Treeview(win, show='headings', column=('rank','link', 'title', 'rating'))
dataTreeview.column('rank', width=2, anchor="center")
dataTreeview.column('link', width=20, anchor="center")
dataTreeview.column('title', width=350, anchor="center")
dataTreeview.column('rating', width=15, anchor="center")

# Column headings shown to the user.
dataTreeview.heading('rank', text='排名')
dataTreeview.heading('link', text='链接')
dataTreeview.heading('title', text='名称')
dataTreeview.heading('rating', text='热度')
dataTreeview.place(x=200, y=180, width=1000, height=300)

# Scrollbar: created as a child of the table and wired to it in both
# directions (scrollbar -> yview, table -> s.set).
s = tkinter.Scrollbar(dataTreeview, command=dataTreeview.yview)
s.pack(side="right", fill="y")
dataTreeview.config(yscrollcommand=s.set)
# Start the Tk main loop.
win.mainloop()

2.1思路

利用前面爬取数据所生成的数据库，将GUI界面与SQLite数据库相连，即可查看排行榜信息：既可以查看全部信息，也可以根据动漫名称关键字搜索相关信息。

除了可以按照标题查找，还能在此基础上拓展出按照热度、链接查找等功能。

2.2运行结果

2.3GUI设计难点

2.3.1按标题查询--模糊查询

content="'%"+selecttitle.get()+"%'"

标签：cur,--,text,AGE,width,dataTreeview,sqlite3,import,con
来源： https://blog.csdn.net/To_Ma_To/article/details/119138812

本站声明： 1. iCode9 技术分享网（下文简称本站）提供的所有内容，仅供技术学习、探讨和分享；
2. 关于本站的所有留言、评论、转载及引用，纯属内容发起人的个人观点，与本站观点和立场无关；
3. 关于本站的所有言论和文字，纯属内容发起人的个人观点，与本站观点和立场无关；
4. 本站文章均是网友提供，不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属；如您发现该文章侵犯了您的权益，可联系我们第一时间进行删除；
5. 本站为非盈利性的个人网站，所有内容不会用来进行牟利，也不会利用任何形式的广告来间接获益，纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

ICode9

Python3爬虫（sqlite3存储信息）--AGE动漫网站排行榜

目标

1.爬虫代码