Recursively crawling all articles, videos, and micro-headline posts a given Toutiao user published in the past month (code snippet)

我姓刘却留不住你的心    2022-12-23

I ran into this as an interview exercise for a crawler position while job hunting recently. It covers a fairly complete set of anti-scraping countermeasures. The company's bar was high, though: they also wanted the video-link expiry problem solved, so no offer for me.

Straight to the code.

import requests
import time
from datetime import datetime
import json
import execjs
import hashlib
import re
import csv
from zlib import crc32
from base64 import b64decode
import random
import urllib3
import os
import threading
from queue import Queue
from lxml import etree

# Check which JS runtime execjs picked up
# print(execjs.get().name)
# Suppress SSL verification warnings
urllib3.disable_warnings()

"""
需要nodejs环境,需要修改subprocess.py文件内的class Popen(object)类中的__init__(..encode=\'utf-8)否则调用js文件时会报错
请求列表页时.py文件中的ua头要与js文件中一致,不然很难请求到数据,请求详情页时要用ua池否则会封浏览器/ip
会有一些空白表格,是因为该账号七天内为发表内容,或者该账号被封禁
输出结果在此文件所在根目录下/toutiao/
右键运行此py文件,newsign.js文件,toutiao.csv文件需在同一文件夹内
爬取的视频有时效性
"""


# User-Agent pool
def headers():
    # Assorted desktop UAs
    user_agent_list = [
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360 Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (LBBROWSER)
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ Browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou Browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    UserAgent = random.choice(user_agent_list)
    headers = {'User-Agent': UserAgent}
    return headers


headers_a = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
}

# Proxy IP
proxy = {
    'http': '183.57.44.62:808'
}

# Cookie values
cookies = {'s_v_web_id': 'b68312370162a4754efb0510a0f6d394'}


# Fetch _signature
def get_signature(user_id, max_behot_time):
    with open('newsign.js', 'r', encoding='utf-8') as f:
        jsData = f.read()
    execjs.get()
    ctx = execjs.compile(jsData).call('tac', str(user_id) + str(
        max_behot_time))  # reproduces TAC.sign(userInfo.id + "" + i.param.max_behot_time)
    return ctx
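
# What execjs.compile(...).call(...) does, shown on a trivial stand-in script.
# Illustration only; the real signing logic lives in the obfuscated newsign.js.
def _demo_execjs():
    ctx = execjs.compile('function tac(s) { return s.split("").reverse().join(""); }')
    return ctx.call('tac', '12345')  # -> '54321'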


# Fetch as and cp
def get_as_cp():  # derives the as/cp query params; ported from Toutiao's obfuscated JS file home_4abea46.js
    zz = {}
    now = round(time.time())
    # print(now)  # current machine time
    e = hex(int(now)).upper()[2:]  # hex() renders an integer as a hexadecimal string
    # print('e:', e)
    a = hashlib.md5()  # hashlib.md5().hexdigest() builds a hash object and returns a hex digest
    # print('a:', a)
    a.update(str(int(now)).encode('utf-8'))
    i = a.hexdigest().upper()
    # print('i:', i)
    if len(e) != 8:
        zz = {'as': '479BB4B7254C150',
              'cp': '7E0AC8874BB0985'}
        return zz
    n = i[:5]
    a = i[-5:]
    r = ''
    s = ''
    for i in range(5):
        s = s + n[i] + e[i]
    for j in range(5):
        r = r + e[j + 3] + a[j]
    zz = {
        'as': 'A1' + s + e[-3:],
        'cp': e[0:3] + r + 'E1'
    }
    # print('zz:', zz)
    return zz
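
# A worked instance of the interleaving above (my sketch, fixed timestamp so the
# output is reproducible): 'as' is 'A1' + five md5/hex-time character pairs +
# the hex time's last three chars; 'cp' mirrors it with offset pairs.
def _demo_as_cp(now=1577836800):  # 2020-01-01 00:00:00 UTC -> hex '5E0BE100', 8 chars
    e = hex(now).upper()[2:]
    i = hashlib.md5(str(now).encode('utf-8')).hexdigest().upper()
    s = ''.join(i[k] + e[k] for k in range(5))
    r = ''.join(e[k + 3] + i[-5:][k] for k in range(5))
    return {'as': 'A1' + s + e[-3:], 'cp': e[0:3] + r + 'E1'}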


# Fetch as, cp, _signature (deprecated)
def get_js():
    f = open(r"juejin.js", 'r', encoding='UTF-8')  # open the JS file
    line = f.readline()
    htmlstr = ''
    while line:
        htmlstr = htmlstr + line
        line = f.readline()
    ctx = execjs.compile(htmlstr)
    return ctx.call('get_as_cp_signature')


# print(json.loads(get_js())['as'])


# Article data
break_flag = []


def wenzhang(url=None, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 50
    headers1 = ['发表时间', '标题', '来源', '所有图片', '文章内容']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=1&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag:
        try:
            # print(url)
            r = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(r.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                article_list = data['data']
                for i in article_list:
                    try:
                        if i['article_genre'] == 'article':
                            res = requests.get('https://www.toutiao.com/i' + i['group_id'], headers=headers(),
                                               cookies=cookies)
                            # time.sleep(1)
                            article_title = re.findall("title: '(.*?)'", res.text)
                            article_content = re.findall("content: '(.*?)'", res.text, re.S)[0]
                            # pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>()';=)*|\d]")
                            # article_content = re.sub(pattern, '', article_content[0])
                            article_content = article_content.replace('&quot;', '').replace('u003C', '<').replace(
                                'u003E',
                                '>').replace(
                                '&#x3D;',
                                '=').replace(
                                'u002F', '/').replace('\\', '')
                            article_images = etree.HTML(article_content)
                            article_image = article_images.xpath('//img/@src')
                            article_time = re.findall("time: '(.*?)'", res.text)
                            article_source = re.findall("source: '(.*?)'", res.text, re.S)
                            result_time = []
                            [result_time.append(i) for i in
                             str(article_time[0]).split(' ')[0].replace('-', ',').split(',')]
                            # print(result_time)
                            cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]),
                                                             int(result_time[2]))).days
                            # print(cha)
                            if 30 < cha <= 32:
                                # print('完成')
                                # break_flag.append(1)
                                # break
                                continue
                            if cha > 32:
                                print('完成')
                                break_flag.append(1)
                                break
                            row = {'发表时间': article_time[0], '标题': article_title[0].strip('&quot;'),
                                   '来源': article_source[0], '所有图片': article_image,
                                   '文章内容': article_content.strip()}
                            with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='', encoding='gb18030') as f:
                                f_csv = csv.DictWriter(f, headers1)
                                # f_csv.writeheader()
                                f_csv.writerow(row)
                            print('正在爬取文章:', article_title[0].strip('&quot;'), article_time[0],
                                  'https://www.toutiao.com/i' + i['group_id'])
                            time.sleep(1)
                        else:
                            pass
                    except Exception as e:
                        print(e, 'https://www.toutiao.com/i' + i['group_id'])
                wenzhang(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
            else:
                pass
        except KeyError:
            n += 1
            print('' + str(n) + '次请求', first_url)
            time.sleep(1)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag.append(1)
            else:
                pass
        except Exception as e:
            print(e)
    else:
        pass

        # print(max_behot_time)
        # print(data)
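
# The pagination pattern shared by wenzhang/shipin/weitoutiao, distilled (my
# sketch, not from the original post): every response carries
# data['next']['max_behot_time'], which becomes the cursor of the next request
# until the server returns a falsy cursor or the caller hits a page cap.
def iter_pages(fetch_page, max_pages=50):
    """fetch_page(cursor) -> (items, next_cursor); yields items page by page."""
    cursor, page = 0, 0
    while page < max_pages:
        items, cursor = fetch_page(cursor)
        for item in items:
            yield item
        page += 1
        if not cursor:  # no older content left
            break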


# Article detail-page data (already merged into the article crawl above)
def get_wenzhang_detail(url, csv_name=0):
    headers1 = ['发表时间', '标题', '来源', '文章内容']
    res = requests.get(url, headers=headers_a, cookies=cookies)
    # time.sleep(1)
    article_title = re.findall("title: '(.*?)'", res.text)
    article_content = re.findall("content: '(.*?)'", res.text, re.S)
    pattern = re.compile(r"[(a-zA-Z~\-_!@#$%\^\+\*&\\\/\?\|:\.<>()';=)*|\d]")
    article_content = re.sub(pattern, '', article_content[0])
    article_time = re.findall("time: '(.*?)'", res.text)
    article_source = re.findall("source: '(.*?)'", res.text, re.S)
    result_time = []
    [result_time.append(i) for i in str(article_time[0]).split(' ')[0].replace('-', ',').split(',')]
    # print(result_time)
    cha = (datetime.now() - datetime(int(result_time[0]), int(result_time[1]), int(result_time[2]))).days
    # print(cha)
    if cha > 8:
        return None

    row = {'发表时间': article_time[0], '标题': article_title[0].strip('&quot;'), '来源': article_source[0],
           '文章内容': article_content.strip()}
    with open('/toutiao/' + str(csv_name) + '文章.csv', 'a', newline='') as f:
        f_csv = csv.DictWriter(f, headers1)
        # f_csv.writeheader()
        f_csv.writerow(row)
    print('正在爬取文章:', article_title[0].strip('&quot;'), article_time[0], url)
    time.sleep(0.5)
    return 'ok'
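
# Equivalent of the result_time gymnastics above, as a strptime sketch;
# article_time strings look like '2019-12-01 12:34:56' per the regexes used.
def days_since(article_time_str):
    published = datetime.strptime(article_time_str.split(' ')[0], '%Y-%m-%d')
    return (datetime.now() - published).days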


# Video data
break_flag_video = []


def shipin(url, max_behot_time=0, csv_name=0, n=0):
    max_qingqiu = 20
    headers2 = ['视频发表时间', '标题', '来源', '视频链接']
    first_url = 'https://www.toutiao.com/c/user/article/?page_type=0&user_id=%s&max_behot_time=%s&count=20&as=%s&cp=%s&_signature=%s' % (
        url.split('/')[-2], max_behot_time, get_as_cp()['as'], get_as_cp()['cp'],
        get_signature(url.split('/')[-2], max_behot_time))
    while n < max_qingqiu and not break_flag_video:
        try:
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            if max_behot_time:
                video_list = data['data']
                for i in video_list:
                    try:
                        start_time = i['behot_time']
                        video_title = i['title']
                        video_source = i['source']
                        detail_url = 'https://www.ixigua.com/i' + i['item_id']

                        resp = requests.get(detail_url, headers=headers())
                        r = str(random.random())[2:]
                        url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(
                            re.findall('"video_id":"(.*?)"', resp.text)[0], r)
                        s = crc32(url_part.encode())
                        api_url = "https://ib.365yg.com{}&s={}".format(url_part, s)
                        resp = requests.get(api_url, headers=headers())
                        j_resp = resp.json()
                        video_url = j_resp['data']['video_list']['video_1']['main_url']
                        video_url = b64decode(video_url.encode()).decode()
                        # print((int(str(time.time()).split('.')[0]) - start_time) / 86400)
                        if 30 < (int(str(time.time()).split('.')[0]) - start_time) / 86400 <= 32:
                            # print('完成')
                            # break_flag_video.append(1)
                            continue
                        if (int(str(time.time()).split('.')[0]) - start_time) / 86400 > 32:
                            print('完成')
                            break_flag_video.append(1)
                            break
                        row = {'视频发表时间': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time)),
                               '标题': video_title, '来源': video_source,
                               '视频链接': video_url}
                        with open('/toutiao/' + str(csv_name) + '视频.csv', 'a', newline='', encoding='gb18030') as f:
                            f_csv = csv.DictWriter(f, headers2)
                            # f_csv.writeheader()
                            f_csv.writerow(row)
                        print('正在爬取视频:', video_title, detail_url, video_url)
                        time.sleep(3)
                    except Exception as e:
                        print(e, 'https://www.ixigua.com/i' + i['item_id'])
                shipin(url=url, max_behot_time=max_behot_time, csv_name=csv_name, n=n)
        except KeyError:
            n += 1
            print('' + str(n) + '次请求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_video.append(1)
        except Exception as e:
            print(e)
    else:
        pass
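
# The video-URL handling above in isolation (my sketch; the video_id value is
# made up for illustration): the path carrying video_id is checksummed with
# crc32 and appended as s=, and the main_url the API returns is base64-encoded,
# hence the b64decode step before the link is usable.
def _demo_video_url(video_id='v0200fabcdef'):  # hypothetical id
    from base64 import b64encode  # only b64decode is imported at module top
    r = str(random.random())[2:]
    url_part = "/video/urls/v/1/toutiao/mp4/{}?r={}".format(video_id, r)
    api_url = "https://ib.365yg.com{}&s={}".format(url_part, crc32(url_part.encode()))
    sample_main_url = b64encode(b'https://example.com/video.mp4').decode()  # stands in for the API field
    return api_url, b64decode(sample_main_url.encode()).decode()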


# Micro-headline (weitoutiao) data
break_flag_weitoutiao = []


def weitoutiao(url, max_behot_time=0, n=0, csv_name=0):
    max_qingqiu = 20
    headers3 = ['微头条发表时间', '来源', '标题', '文章内图片', '微头条内容']
    while n < max_qingqiu and not break_flag_weitoutiao:
        try:

            first_url = 'https://www.toutiao.com/api/pc/feed/?category=pc_profile_ugc&utm_source=toutiao&visit_user_id=%s&max_behot_time=%s' % (
                url.split('/')[-2], max_behot_time)
            # print(first_url)
            res = requests.get(first_url, headers=headers_a, cookies=cookies)
            data = json.loads(res.text)
            # print(data)
            max_behot_time = data['next']['max_behot_time']
            weitoutiao_list = data['data']
            for i in weitoutiao_list:
                try:
                    detail_url = 'https://www.toutiao.com/a' + str(i['concern_talk_cell']['id'])
                    # print(detail_url)
                    resp = requests.get(detail_url, headers=headers(), cookies=cookies)
                    start_time = re.findall("time: '(.*?)'", resp.text, re.S)
                    weitoutiao_name = re.findall("name: '(.*?)'", resp.text, re.S)
                    weitoutiao_title = re.findall("title: '(.*?)'", resp.text, re.S)
                    # The source post is cut off in the middle of the next call; the regex
                    # below is a guessed placeholder in the spirit of the ones above, and
                    # the rest of the original function body did not survive. The error
                    # handling that follows mirrors the sibling functions so the module parses.
                    weitoutiao_images = re.findall("image: '(.*?)'", resp.text, re.S)
                except Exception as e:
                    print(e)
        except KeyError:
            n += 1
            print('' + str(n) + '次请求', first_url)
            time.sleep(3)
            if n == max_qingqiu:
                print('请求超过最大次数')
                break_flag_weitoutiao.append(1)
        except Exception as e:
            print(e)
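
# The tail of the original post (presumably the part that wires everything up)
# is also missing. Judging from the unused threading/Queue imports and the note
# that toutiao.csv must sit beside this script, it most likely read user profile
# URLs from toutiao.csv and ran the three crawlers for each user. A hypothetical,
# single-threaded reconstruction (column layout and wiring are assumptions, not
# the author's confirmed code):
def crawl_all(csv_path='toutiao.csv'):
    with open(csv_path, encoding='utf-8') as f:
        for csv_name, row in enumerate(csv.reader(f)):
            user_url = row[0]  # assumed: profile URL in the first column
            del break_flag[:], break_flag_video[:], break_flag_weitoutiao[:]  # reset stop flags per user
            wenzhang(url=user_url, csv_name=csv_name)
            shipin(user_url, csv_name=csv_name)
            weitoutiao(user_url, csv_name=csv_name)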
