> Python > Python批量爬取某财富股票网证券机构宏观研报

Python批量爬取某财富股票网证券机构宏观研报

Python 投稿 2021-10-17 20:23 68800℃ 0 评论

基本思路：

首先获取进入宏观研报的页面，分析页面知道是JS显示
获取真实json的链接，分析json的链接，可知链接里面参数含义，比如页面显示研报数量，第几页，查询起止时间，毫秒时间戳等等
解析json，得到单独研报的标题、页面真实地址
再次访问单独研报页面，获取“PDF”的存放链接地址（这里的文件类型不一定是PDF，也有可能是Excel）
按标题修改文件并下载保存

有兴趣的朋友可以在评论中分享完善其他功能：

为了防止被网站封禁，加了随机User-Agent，但是代理、多线程等功能还没学过，无法实现
进度条可以有，但是还在研究
研报存到指定的目录（没有就生成），并且判断文件是否存在，存在就跳过的功能
我知道我的代码还可以再精简，但是知识经验太少，暂时就这样

import requests, json, time, random,os
from bs4 import BeautifulSoup
 
def UserAgent():
    """Build a request-headers dict with a randomly chosen User-Agent string.

    Returns:
        A dict with the single key ``'User-Agent'`` whose value is picked by
        ``random.choice`` from a fixed pool of browser identification strings.
    """
    # The pool is kept entry-for-entry, including the repeated Chrome OS
    # string, so the selection probabilities stay the same.
    candidates = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
    )
    return {'User-Agent': random.choice(candidates)}
 
def timenow():
    """Return the current time as a millisecond timestamp and a local date.

    Returns:
        A ``(last_para, end_Time)`` tuple: the epoch time in whole
        milliseconds, and the local date formatted as ``YYYY-MM-DD``.
    """
    millis = int(time.time() * 1000)
    # The clock is read a second time, truncated to whole seconds, for the
    # date part.
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    return millis, today
 
def macResearch_page(end_Time,last_para,UserAgent):
    """Fetch the first page of the macro research report listing.

    Args:
        end_Time: End date of the query window; rendered into the URL as a
            string.
        last_para: Value of the trailing ``_`` query parameter; rendered into
            the URL as a string.
        UserAgent: Headers mapping handed to ``requests.get``.

    Returns:
        A ``(text, macResearch)`` tuple: the response body text, and the URL
        prefix used to build links to individual report pages.
    """
    # Author's note: testing showed 100 to be the largest accepted pageSize.
    listing_url = (
        'http://reportapi.eastmoney.com/report/jg?pageSize=100&beginTime=2018-03-19'
        '&endTime={!s}&pageNo=1&fields=&qType=3&orgCode=&author=&_={!s}'
    ).format(end_Time, last_para)
    response = requests.get(listing_url, timeout=3, headers=UserAgent)
    return response.text, 'http://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl='
 
def parser_json(text, sub_url):
    """Map report titles to their detail-page URLs from the listing JSON.

    Args:
        text: JSON document whose ``data`` entry is a list of report records,
            each carrying ``publishDate``, ``title`` and ``encodeUrl`` keys.
        sub_url: URL prefix to which each record's ``encodeUrl`` is appended.

    Returns:
        A dict mapping ``<YYYYMMDD><title>`` to the report page URL. A later
        record overwrites an earlier one that produces the same key. The dict
        is empty when ``data`` is missing, null or an empty list.

    Raises:
        KeyError: If a record lacks one of the three expected keys.
        ValueError: If ``text`` is not valid JSON.
    """
    # json.loads() no longer accepts the ``encoding`` keyword (removed in
    # Python 3.9); passing it raises TypeError there.
    payload = json.loads(text)
    pages = {}
    # ``or []`` covers both a missing "data" key and an explicit null.
    for record in payload.get('data') or []:
        # First ten characters are the date part; drop the dashes.
        date_prefix = record['publishDate'][:10].replace('-', '')
        pages[date_prefix + record['title']] = sub_url + record['encodeUrl']
    return pages
 
def save_file(dic, UserAgent, num):
    """Download the file linked from each report page into the working directory.

    For every entry the report page is fetched, the first ``<a>`` element with
    CSS class ``pdf-link`` is located, and the file behind its ``href`` is
    written to ``<title>.<extension>``. Pages without such a link are reported
    and skipped instead of aborting the whole run.

    Args:
        dic: Mapping of file title to report page URL.
        UserAgent: Headers mapping handed to ``requests.get``.
        num: Stop after this many entries have been processed (skipped pages
            count too). The author's note says it is an integer that should
            not exceed 100. Values below 1 never match the counter, so every
            entry is processed.
    """
    # Characters that are rejected in file names on common platforms (or act
    # as path separators) are replaced so a title cannot break open().
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    for count, (name, url_r) in enumerate(dic.items(), 1):
        real_html = requests.get(url_r, headers=UserAgent).text
        soup = BeautifulSoup(real_html, 'html.parser')
        # The original passed a set literal as ``attrs``; bs4 treats any
        # non-dict value as a class filter, so it also matched anchors whose
        # class is literally "class". Filter on the intended class only.
        link = soup.find('a', class_='pdf-link')
        file_url = link.get('href') if link is not None else None
        if file_url:
            last_response = requests.get(file_url, headers=UserAgent)
            file_down = last_response.content
            # The extension is whatever follows the last dot of the link.
            file_name = (name + '.' + file_url.split('.')[-1]).translate(unsafe_chars)
            # Measure the received bytes; a Content-Length header is not
            # guaranteed to be present.
            content_size = len(file_down) / (1024 * 1024)
            print(file_url)
            with open(file_name, 'wb') as f:
                f.write(file_down)
            print('成功！保存完成，文件大小：{:0.3f}MB'.format(content_size))
        else:
            print('跳过：未找到文件链接 {}'.format(url_r))
        if count == num:
            break
 
 
 
def main():
    """Run the interactive download workflow.

    Prompts until an integer from 1 to 100 is entered, then fetches the report
    listing, parses it into a title-to-URL mapping and downloads that many
    report files.
    """
    prompt = '注意：请输入1-100的整数：\n'
    while True:
        try:
            num = int(input(prompt))
        except ValueError:
            # Not an integer: ask again instead of crashing with a traceback.
            continue
        # Values below 1 would make save_file() download every entry.
        if 1 <= num <= 100:
            break
    user_agent = UserAgent()
    last_para, end_Time = timenow()
    text, sub_url = macResearch_page(end_Time, last_para, user_agent)
    dic = parser_json(text, sub_url)
    save_file(dic, user_agent, num)
    print('全部完成！')


# Guarded so that importing this module does not start the scraper.
if __name__ == '__main__':
    main()

编程笔记 » Python批量爬取某财富股票网证券机构宏观研报

相关文章

游客发表我的评论换个身份

取消评论

表情有人回复时邮件通知我

(0)个小伙伴在吐槽