ICode9

精准搜索请尝试: 精确搜索
首页 > 编程语言> 文章详细

python 网易云音乐评论爬取1

2020-01-26 11:41:19  阅读:226  来源: 互联网

标签:info comment 网易 python com 爬取 评论 total data


原文链接:
(1)Python爬取网易云音乐评论
https://www.jianshu.com/p/92950e9605c9
(2)网易云音乐评论爬虫(三):爬取歌曲的全部评论
https://yq.aliyun.com/articles/672464

#(1)[Python爬取网易云音乐评论](https://www.jianshu.com/p/92950e9605c9)
# Default request headers: a desktop Chrome User-Agent, used by getHtml() for page fetches.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
}

# Site root; relative hrefs scraped from pages are appended to it to form absolute URLs.
baseUrl = 'https://music.163.com'
def getHtml(url):
    """Fetch ``url`` with the module-level ``headers`` and return the body as text."""
    return requests.get(url, headers=headers).text

def getUrl():
    """Scrape the newest-playlists page and crawl the first playlist found on it.

    For the first playlist matched on the page, its title, URL, author and
    author URL are recorded and ``getSongSheet`` is called on the playlist URL.
    The loop stops after that first playlist (deliberate ``break``).

    Returns:
        list of dict: one entry per processed playlist with the keys
        ``title``, ``url``, ``author`` and ``authorUrl``. Empty when the page
        contains no playlist match.
    """
    # Start from the "newest playlists" listing.
    startUrl = 'https://music.163.com/discover/playlist/?order=new'
    html = getHtml(startUrl)
    pattern = re.compile('<li>.*?<p.*?class="dec">.*?<.*?title="(.*?)".*?href="(.*?)".*?>.*?span class="s-fc4".*?title="(.*?)".*?href="(.*?)".*?</li>', re.S)
    result = re.findall(pattern, html)
    # Total number of playlist pages as shown in the pager. Not used yet, since
    # only the first page is processed; guarded so a missing pager cannot crash.
    pager = re.findall(r'<span class="zdot".*?class="zpgi">(.*?)</a>', html, re.S)
    pageNum = pager[0] if pager else None
    info = []
    # Collect the wanted fields for the playlists on the first page.
    for title, href, author, authorHref in result:
        url = baseUrl + href
        print(url)
        info.append({
            'title': title,
            'url': url,
            'author': author,
            'authorUrl': baseUrl + authorHref,
        })
        # Crawl the songs (and their comments) of this playlist.
        getSongSheet(url)
        time.sleep(random.randint(1, 10))
        # Only the first playlist of the first page is handled for now.
        break
    return info
def getSongSheet(url):
    """Crawl the comments of the first song listed on a playlist page.

    The playlist page at ``url`` is fetched and every ``/song?id=...`` link is
    extracted. For the first song, the comment API is queried once to learn
    the total comment count, then all comment pages are fetched through
    ``getMusicComments`` and stored with ``saveToMongoDB``. The loop stops
    after that first song (deliberate ``break``).

    Args:
        url: absolute URL of a playlist page.
    """
    # The song ids are the key needed for the comment POST requests below.
    html = getHtml(url)
    result = re.findall(r'<li><a.*?href="/song\?id=(.*?)">(.*?)</a></li>', html, re.S)
    # The original code always drops the last match (presumably a non-song
    # entry -- TODO confirm); guard it so an empty page does not raise IndexError.
    if result:
        result.pop()
    musicList = []
    for songId, songTitle in result:
        headers1 = {
            'Referer': 'https://music.163.com/song?id={}'.format(songId),
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'
        }
        musicUrl = baseUrl + '/song?id=' + songId
        print(musicUrl)
        # Song URL and song name.
        musicList.append({'musicUrl': musicUrl, 'title': songTitle})
        postUrl = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='.format(songId)
        param = {
            'params': get_params(1),
            'encSecKey': get_encSecKey()
        }
        r = requests.post(postUrl, data=param, headers=headers1)
        # Total number of comments for this song.
        total = int(r.json()['total'])
        # Number of full pages (20 comments each). Floor division keeps this an
        # int on Python 3 as well, which range() in getMusicComments requires.
        comment_TatalPage = total // 20
        print(comment_TatalPage)
        # A remainder means one extra, partially filled page.
        if total % 20 != 0:
            comment_TatalPage += 1
        comment_data, hotComment_data = getMusicComments(comment_TatalPage, postUrl, headers1)
        # The title is passed through unchanged: wrapping it in str() raises
        # UnicodeEncodeError for non-ASCII titles on Python 2.
        # If duplicate-ID errors show up while saving, check whether only a
        # single record was crawled.
        saveToMongoDB(songTitle, comment_data, hotComment_data)
        print('End!')

        time.sleep(random.randint(1, 10))
        break
def getMusicComments(comment_TatalPage ,postUrl, headers1):
    """Fetch every comment page of one song and collect the comment records.

    Args:
        comment_TatalPage: number of comment pages to request (pages 1..N).
        postUrl: comment API URL of the song.
        headers1: request headers forwarded to ``getPostApi``.

    Returns:
        tuple ``(comments, hot_comments)``; each is a list of dicts with the
        keys ``content``, ``author`` and ``likedCount``.
    """
    def _pick(entries):
        # Reduce raw API comment objects to the three stored fields.
        return [
            {
                'content': entry['content'],
                'author': entry['user']['nickname'],
                'likedCount': entry['likedCount'],
            }
            for entry in entries
        ]

    commentinfo = []
    hotcommentinfo = []
    for j in range(1, comment_TatalPage + 1):
        payload = getPostApi(j, postUrl, headers1).json()
        commentinfo.extend(_pick(payload['comments']))
        # Hot comments are only read from the first page.
        if j == 1:
            hotcommentinfo.extend(_pick(payload['hotComments']))
        print(u'第'+str(j)+u'页爬取完毕...')
        time.sleep(random.randint(1,10))
    print(commentinfo)
    print('\n-----------------------------------------------------------\n')
    print(hotcommentinfo)
    return commentinfo, hotcommentinfo
# offset is (comment page - 1) * 20; total is "true" for the first page and "false" for every later page
# first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}' # first parameter
# Second parameter
second_param = "010001"
# Third parameter
third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
# Fourth parameter (used by get_params as the key of the first AES pass)
forth_param = "0CoJUm6Qyw8W8jud"
# Build the encrypted "params" form field
def get_params(page): # page: 1-based comment page number
    """Return the doubly AES-encrypted request payload for comment page ``page``.

    The payload is encrypted first with ``forth_param`` and then with a key of
    sixteen ``'F'`` characters, both times with the same fixed IV.
    """
    iv = "0102030405060708"
    if page == 1:
        # First page: offset 0 and total "true".
        first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        # Later pages: offset advances by 20 per page and total is "false".
        first_param = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' %(str((page-1)*20),'false')
    inner = AES_encrypt(first_param, forth_param, iv)
    return AES_encrypt(inner, 16 * 'F', iv)

# Provide the encSecKey form field
def get_encSecKey():
    """Return the fixed, precomputed ``encSecKey`` hex string sent with every request."""
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"

# Encryption step
def AES_encrypt(text, key, iv):
    """AES-CBC encrypt ``text`` with ``key``/``iv`` and return the Base64 result.

    ``text`` is first padded to a multiple of 16 with ``pad`` copies of
    ``chr(pad)``, where ``pad`` is the number of characters added (1..16).
    """
    pad = 16 - len(text) % 16
    padded = text + chr(pad) * pad
    cipher = AES.new(key, AES.MODE_CBC, iv)
    return base64.b64encode(cipher.encrypt(padded))

# POST to the comment API and return the response
def getPostApi(j ,postUrl, headers1):
    """Request comment page ``j`` from ``postUrl`` and return the response object.

    The form body carries the page-specific ``params`` from ``get_params(j)``
    and the ``encSecKey`` from ``get_encSecKey()``.
    """
    form = {'params': get_params(j), 'encSecKey': get_encSecKey()}
    return requests.post(postUrl, data=form, headers=headers1)
def saveToMongoDB(musicName,comment_data,hotComment_data):
    """Store the crawled comments of one song in the local ``Music163`` database.

    Args:
        musicName: song name; used as the collection name.
        comment_data: list of comment dicts.
        hotComment_data: list of hot-comment dicts; inserted first.
    """
    client = pymongo.MongoClient(host='localhost',port=27017)
    try:
        collection = client['Music163'][musicName]
        for batch in (hotComment_data, comment_data):
            # insert_many replaces the deprecated Collection.insert (removed in
            # PyMongo 4) and rejects an empty list, so empty batches are skipped.
            if batch:
                collection.insert_many(batch)
    finally:
        # Release the connection even when an insert fails.
        client.close()
    print(musicName+u'已存入数据库...')

# Script entry point: run the crawl only when the file is executed directly.
if __name__ == '__main__':
    getUrl()
#(2)[网易云音乐评论爬虫(三):爬取歌曲的全部评论](https://yq.aliyun.com/articles/672464)
#GitHub(https://github.com/zyingzhou/wangyiyun_music/blob/master/get_comments.py)
#! /usr/bin/env python
# coding='utf-8'
'''
获取网易云音乐歌曲全部评论
Author: zhouzying
URL: https://www.zhouzying.cn
Date: 2018-09-14
Update: 2018-09-27         Add data argument.
Update: 2018-10-04         Get replied comments and add users name who shared comments.
'''
import requests
import math
import random
# pycrypto
from Crypto.Cipher import AES
import codecs
import base64

# POST to the comment API and return the decoded JSON
def get_comments_json(url, data):
    """POST ``data`` to ``url`` and return the parsed JSON response.

    Args:
        url: comment API endpoint.
        data: form fields for the request (``params`` and ``encSecKey``).

    Returns:
        The decoded JSON object on HTTP 200; ``None`` when the request fails,
        the status is not 200, or the body is not valid JSON.
    """
    headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'zh-CN,zh;q=0.9',
             'Connection': 'keep-alive',
             'Cookie': 'WM_TID=36fj4OhQ7NdU9DhsEbdKFbVmy9tNk1KM; _iuqxldmzr_=32; _ntes_nnid=26fc3120577a92f179a3743269d8d0d9,1536048184013; _ntes_nuid=26fc3120577a92f179a3743269d8d0d9; __utmc=94650624; __utmz=94650624.1536199016.26.8.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); WM_NI=2Uy%2FbtqzhAuF6WR544z5u96yPa%2BfNHlrtTBCGhkg7oAHeZje7SJiXAoA5YNCbyP6gcJ5NYTs5IAJHQBjiFt561sfsS5Xg%2BvZx1OW9mPzJ49pU7Voono9gXq9H0RpP5HTclE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6eed5cb8085b2ab83ee7b87ac8c87cb60f78da2dac5439b9ca4b1d621f3e900b4b82af0fea7c3b92af28bb7d0e180b3a6a8a2f84ef6899ed6b740baebbbdab57394bfe587cd44b0aebcb5c14985b8a588b6658398abbbe96ff58d868adb4bad9ffbbacd49a2a7a0d7e6698aeb82bad779f7978fabcb5b82b6a7a7f73ff6efbd87f259f788a9ccf552bcef81b8bc6794a686d5bc7c97e99a90ee66ade7a9b9f4338cf09e91d33f8c8cad8dc837e2a3; JSESSIONID-WYYY=G%5CSvabx1X1F0JTg8HK5Z%2BIATVQdgwh77oo%2BDOXuG2CpwvoKPnNTKOGH91AkCHVdm0t6XKQEEnAFP%2BQ35cF49Y%2BAviwQKVN04%2B6ZbeKc2tNOeeC5vfTZ4Cme%2BwZVk7zGkwHJbfjgp1J9Y30o1fMKHOE5rxyhwQw%2B%5CDH6Md%5CpJZAAh2xkZ%3A1536204296617; __utma=94650624.1052021654.1536048185.1536199016.1536203113.27; __utmb=94650624.12.10.1536203113',
             'Host': 'music.163.com',
             'Referer': 'http://music.163.com/',
             'Upgrade-Insecure-Requests': '1',
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/66.0.3359.181 Safari/537.36'}
    try:
        r = requests.post(url, headers=headers, data=data)
        r.encoding = "utf-8"
        if r.status_code == 200:
            # Hand back the decoded JSON payload.
            return r.json()
    # Only network errors and undecodable bodies are treated as a failed crawl;
    # a bare except would also swallow KeyboardInterrupt and programming errors.
    except (requests.RequestException, ValueError):
        print("爬取失败!")
    return None

# Generate a random alphanumeric string
def generate_random_strs(length):
    """Return ``length`` characters drawn uniformly from ``[a-zA-Z0-9]``.

    Args:
        length: number of characters to generate; 0 or less yields ``""``.

    Returns:
        str: the random string.
    """
    string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    # random.choice + join avoids rebuilding list(string) on every iteration
    # and growing the result one concatenation at a time.
    return "".join(random.choice(string) for _ in range(length))

# AES encryption
def AESencrypt(msg, key):
    """AES-CBC encrypt ``msg`` with ``key`` and return the Base64 text.

    Args:
        msg: plaintext as ``str`` (encoded as UTF-8) or ``bytes``.
        key: AES key as ``str`` or ``bytes``.

    Returns:
        str: Base64 encoding of the ciphertext.
    """
    def _to_bytes(value):
        # The cipher works on bytes; PyCryptodome rejects str on Python 3.
        return value.encode('utf-8') if isinstance(value, str) else value

    data = _to_bytes(msg)
    # Pad to a multiple of 16 *bytes*: append `padding` copies of the byte whose
    # value is `padding` (a full extra block when already aligned).
    padding = 16 - len(data) % 16
    data += bytes([padding]) * padding
    # Fixed initialisation vector (must be 16 bytes long).
    iv = b'0102030405060708'
    cipher = AES.new(_to_bytes(key), AES.MODE_CBC, iv)
    encryptedbytes = cipher.encrypt(data)
    # Base64-encode the ciphertext and return it as text.
    return base64.b64encode(encryptedbytes).decode('utf-8')

# RSA encryption
def RSAencrypt(randomstrs, key, f):
    """Textbook-RSA encrypt the reversed ``randomstrs``.

    The reversed string is read as a big-endian integer ``m`` and
    ``m ** key mod f`` is returned.

    Args:
        randomstrs: the text to encrypt (``str``).
        key: exponent as a hexadecimal string.
        f: modulus as a hexadecimal string.

    Returns:
        str: lowercase hex of the result, left-padded with zeros to 256 digits.
    """
    # The string is encrypted in reverse order.
    text = bytes(randomstrs[::-1], 'utf-8')
    message = int.from_bytes(text, 'big')
    # Three-argument pow reduces modulo f at every step instead of building the
    # full power first (millions of bits for a 65537 exponent).
    seckey = pow(message, int(key, 16), int(f, 16))
    return format(seckey, 'x').zfill(256)

# Build the encrypted form fields for one comment page
def get_params(page):
    """Return ``(params, encSecKey)`` for requesting comment page ``page``.

    The JSON request message is AES-encrypted twice, first with a fixed key and
    then with a fresh random 16-character key; that random key is RSA-encrypted
    to produce ``encSecKey``.
    """
    # Notes carried over from the original author:
    # - the message may also be written as
    #   {"offset": "(page - 1) * 20", "limit": "20"}; offset and limit are the
    #   two fields that must be present, the others do not affect the result.
    # - limit can be at most 100; with 100, page two returns 80 new comments
    #   plus the 20 already shown on page one.
    # - example: '{"rid":"R_SO_4_1302938992","offset":"0","total":"True","limit":"100","csrf_token":""}'
    modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    exponent = '010001'
    preset_key = '0CoJUm6Qyw8W8jud'
    # Page offset: 20 comments per page.
    skip = (page-1) * 20
    message = '{"offset":' + str(skip) + ',"total":"True","limit":"20","csrf_token":""}'
    first_pass = AESencrypt(message, preset_key)
    # Fresh random 16-character key for the second AES pass.
    session_key = generate_random_strs(16)
    # Two AES passes give the "params" value.
    encText = AESencrypt(first_pass, session_key)
    # RSA-encrypting the random key gives the "encSecKey" value.
    encSecKey = RSAencrypt(session_key, exponent, modulus)
    return encText, encSecKey

def hotcomments(html, songname, i, pages, total, filepath):
    """Print the hot comments of one page and append them to ``filepath``.

    Args:
        html: decoded JSON response; hot comments are read from its
            ``hotComments`` key when that key is present.
        songname: song title used in the progress line.
        i: current page number.
        pages: total number of pages.
        total: total number of comments.
        filepath: UTF-8 text file the output is appended to.
    """
    header = "正在获取歌曲{}的第{}页评论,总共有{}页{}条评论!\n".format(songname, i, pages, total)
    # Open the output file once instead of reopening it for every comment.
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(header)
        print(header)
        # Nothing more to do when the response carries no hot comments.
        if 'hotComments' not in html:
            return
        for m, item in enumerate(html['hotComments'], start=1):
            # User who posted the hot comment.
            user = item['user']
            print("热门评论{}: {} : {}    点赞次数: {}".format(m, user['nickname'], item['content'], item['likedCount']))
            f.write("热门评论{}: {} : {}   点赞次数: {}\n".format(m, user['nickname'], item['content'], item['likedCount']))
            # Comments this one replies to; tolerate a missing or null field.
            for reply in item.get('beReplied') or []:
                replyuser = reply['user']
                print("回复:{} : {}".format(replyuser['nickname'], reply['content']))
                f.write("回复:{} : {}\n".format(replyuser['nickname'], reply['content']))

def comments(html, songname, i, pages, total, filepath):
    """Print the regular comments of one page and append them to ``filepath``.

    Args:
        html: decoded JSON response; comments are read from its ``comments`` key.
        songname: song title used in the progress line.
        i: current page number.
        pages: total number of pages.
        total: total number of comments.
        filepath: UTF-8 text file the output is appended to.

    Raises:
        KeyError: if ``html`` has no ``comments`` key.
    """
    header = "\n正在获取歌曲{}的第{}页评论,总共有{}页{}条评论!\n".format(songname, i, pages, total)
    # Open the output file once instead of reopening it for every comment.
    with open(filepath, 'a', encoding='utf-8') as f:
        f.write(header)
        print(header)
        for j, item in enumerate(html['comments'], start=1):
            # User who posted the comment.
            user = item['user']
            print("全部评论{}: {} : {}    点赞次数: {}".format(j, user['nickname'], item['content'], item['likedCount']))
            f.write("全部评论{}: {} : {}   点赞次数: {}\n".format(j, user['nickname'], item['content'], item['likedCount']))
            # Comments this one replies to; tolerate a missing or null field.
            for reply in item.get('beReplied') or []:
                replyuser = reply['user']
                print("回复:{} : {}".format(replyuser['nickname'], reply['content']))
                f.write("回复:{} : {}\n".format(replyuser['nickname'], reply['content']))

def main():
    """Download every comment page of one hard-coded song into a text file.

    The first page supplies the total comment count and the hot comments; the
    remaining pages are fetched one after another. Output is appended to
    ``"<songname>.txt"``. A failed first request aborts the run; a failed later
    page is reported and skipped.
    """
    # Song id.
    songid = 38592976
    # Song title.
    songname = "Dream it possible"
    # Output file path.
    filepath = songname + ".txt"
    url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(songid) + '?csrf_token='

    # Fetch the first page of comments.
    page = 1
    params, encSecKey = get_params(page)
    html = get_comments_json(url, {'params': params, 'encSecKey': encSecKey})
    # get_comments_json returns None on failure; without the total there is
    # nothing to paginate over.
    if not html or 'total' not in html:
        print("获取歌曲{}的第{}页评论失败,已终止!".format(songname, page))
        return
    # Total number of comments and of 20-comment pages.
    total = html['total']
    pages = math.ceil(total / 20)
    hotcomments(html, songname, page, pages, total, filepath)
    comments(html, songname, page, pages, total, filepath)

    # Fetch the remaining pages, starting from page two.
    for page in range(2, pages + 1):
        params, encSecKey = get_params(page)
        html = get_comments_json(url, {'params': params, 'encSecKey': encSecKey})
        if not html or 'comments' not in html:
            # Best effort: report the failed page and carry on with the next.
            print("获取歌曲{}的第{}页评论失败,已跳过!".format(songname, page))
            continue
        comments(html, songname, page, pages, total, filepath)

# Script entry point: run the crawl only when the file is executed directly.
if __name__ == "__main__":
    main()
Jxufe渣渣斯 发布了57 篇原创文章 · 获赞 107 · 访问量 17万+ 私信 关注

标签:info,comment,网易,python,com,爬取,评论,total,data
来源: https://blog.csdn.net/JxufeCarol/article/details/104086216

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有