我的打工网信息内容抓取python脚本

45次阅读
没有评论

9月份写的一个脚本。当时有个中间商找我写脚本,爬取"我的打工网"的内容。我简单看了一下,里面的内容都是通过接口获取的,唯一的难点是请求里需要一个签名字段用于验证。说实话之前没搞过签名,简单查了一下资料才大概了解怎么搞;因为不熟悉js,所以搞了几个小时才搞定,也算是不负众望吧。但是结果是戏剧性的:tm内容拿下来了,那个找我的大佬却说他把需求给搞错了,我……

原需求人家是要爬简历???你没理解错,就是注册用户投递的简历……这我真搞不定。

好吧,最后人家原客户提出给100辛苦费,中间商抽了20,我到手80……我没说什么,反正以后找我搞东西就先给钱吧。因为以前我一直怕接了单子结果搞不定会尴尬,但是从那以后,就算搞不定再把钱退给你,我也要先收费。今天突然看到这个脚本了,也不晓得还能不能用,反正就放出来了。当时的操作详情我也没留,因为没打算发出来的。
有不懂的可以留言,重在交流!!!

import requests
import json
import hashlib
import time

from docx import Document
from docx.enum.text import WD_BREAK

def sign(d, t):
    """Return the request signature for payload *d* at timestamp *t*.

    The signature is the hexadecimal MD5 digest of the UTF-8 encoding of
    ``app key + str(t) + d + secret`` concatenated in that order.

    Args:
        d: The payload string; it is concatenated as-is and therefore must
            already be a ``str``.
        t: The timestamp; converted with ``str()`` before concatenation.

    Returns:
        A 32-character lowercase hexadecimal digest string.
    """
    app_key = "WKWeb"
    secret = "a323f9b6-1f04-420e-adb9-b06d142c5e63"

    # str.join rejects a non-str *d* with TypeError, exactly as the plain
    # "+" concatenation of a str with a non-str would.
    plaintext = "".join((app_key, str(t), d, secret))
    return hashlib.md5(plaintext.encode("utf-8")).hexdigest()


def detail(id):
    """Fetch the detail record of a single recruitment posting.

    Args:
        id: Record id of the posting.  It is interpolated unquoted into the
            JSON payload as ``RecruitId``.  (The name shadows the ``id``
            builtin but is kept so existing callers are unaffected.)

    Returns:
        The decoded JSON response body.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    # Timestamp in whole seconds; the same value is used for the signature
    # and for the request envelope.
    t = round(int(round(time.time() * 1000)) / 1e3)
    header = {
        "Content-Type": "application/json; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    }
    # The payload is built as a literal string (not via json.dumps) because
    # the signature is computed over these exact characters.
    d = '{"RecruitId":%s,"Lng":"114.52082821245915","Lat":"38.048684212036896","Address":"","UserSign":null}' % id
    url = "https://www.wodedagong.com/ls_api/LS_RecruitServicesManager/GetRecruitDetail"
    envelope = {
        "AppVer": "1.0.0",
        "TimeStamp": t,
        "Lang": "CN",
        "DeviceName": "web",
        "DeviceType": "web",
        "Token": "",
        "Uid": 0,
        "AppKey": "WKWeb",
        "Sign": sign(d, t),
        "Data": d,
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.post(url, data=json.dumps(envelope), headers=header, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    res.raise_for_status()
    return json.loads(res.text)
    
    
def _append_posting_to_docx(det, path='test.docx'):
    """Append the fields of one posting detail to the first paragraph of *path*.

    The document at *path* must already exist and its first paragraph must
    contain at least one run.  The file is re-opened and saved for every
    posting so that progress is persisted record by record.

    Every value is passed through ``str()`` so a numeric or ``None`` field
    is written out instead of aborting the run with a ``TypeError``.
    """
    info = det['Data']
    document = Document(path)
    paragraph = document.paragraphs[0]

    # Start this posting on a new line after whatever is already there.
    paragraph.runs[-1].add_break(WD_BREAK.LINE)

    labelled_fields = (
        ("企业名称:", 'EnterpriseName'),
        ("薪资范围:", 'WagesView'),
        ("联系电话:", 'ContanctsPhone'),
        ("发布标签:", 'TagNames'),
        ("来源用户:", 'FromUser'),
        ("企业地址:", 'WorkAddress'),
        ("招聘岗位:", 'WorkPost'),
        ("基本要求:", 'WorkRequire'),
        ("工作环境:", 'ImageList'),
        ("刷新时间:", 'RefreshTime'),
        ("位置经度:", 'Longitude'),
        ("位置纬度:", 'Latitude'),
    )
    for label, key in labelled_fields:
        paragraph.add_run(label + str(info[key]) + "\n")

    # Link that opens the posting's coordinates in the Tencent map component.
    paragraph.add_run(
        "进入地图:"
        + "https://mapapi.qq.com/web/mapComponents/locationMarker/v/index.html?marker=coord%3A"
        + str(info['Latitude'])
        + "%2C"
        + str(info['Longitude'])
        + "%3Btitle%3A"
        + str(info['EnterpriseName'])
        + "%3B%27&key=TKUBZ-D24AF-GJ4JY-JDVM2-IBYKK-KEBCU&referer=tengxun&ch=uri-api&ADTAG=uri-api.other"
        + "\n\n\n\n"
    )

    document.save(path)


def list(id):
    """Crawl one page of search results and append each posting to test.docx.

    Requests a page of 10 records starting at record index *id*, skips the
    entries identified as advertisements, fetches the detail of every
    remaining entry via ``detail`` and appends it to ``test.docx``.

    Args:
        id: Zero-based index of the first record of the page; interpolated
            unquoted into the JSON payload as ``RecordIndex``.  (Both the
            function name and this parameter shadow builtins; they are kept
            so existing callers are unaffected.)

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    # Timestamp in whole seconds; shared by the signature and the envelope.
    t = round(int(round(time.time() * 1000)) / 1e3)
    # Built as a literal string (not via json.dumps) because the signature is
    # computed over these exact characters.
    d = '{"RecordIndex":%s,"RecordSize":10,"Lng":114.52082821245915,"Lat":38.048684212036896,"VirtualLng":120.6174,"VirtualLat":31.335106,"SearchName":"服务员","SortType":1,"SalaryRoundId":0,"IndustryId":0,"ProfessionalIds":[],"AreaId":0,"Pneumonia":0}' % id

    header = {
        "Content-Type": "application/json; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
    }
    envelope = {
        "AppVer": "1.0.0",
        "TimeStamp": t,
        "Lang": "CN",
        "DeviceName": "web",
        "DeviceType": "web",
        "Token": "",
        "Uid": 0,
        "AppKey": "WKWeb",
        "Sign": sign(d, t),
        "Data": d,
    }

    url = "https://www.wodedagong.com/ls_api/LS_RecruitDataManager/GetRecruitList"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.post(url, data=json.dumps(envelope), headers=header, timeout=30)
    res.raise_for_status()
    data = json.loads(res.text)

    for i in data['Data']['RecordList']:
        # Logical "or" is required here.  The previous bitwise "|" bound
        # tighter than "==", turning the test into the chained comparison
        # Type == (1 | RecordId) == 1, which let most advertisement entries
        # through.
        if i['Type'] == 1 or i['RecordId'] == 1:
            print("过滤广告")
            continue
        _append_posting_to_docx(detail(i['RecordId']))
            
# Driver: request the first result page only to learn the total number of
# matching records, then crawl every page of 10 records with list().
t = round(int(round(time.time() * 1000)) / 1e3)
# Built as a literal string (not via json.dumps) because the signature is
# computed over these exact characters.
d = '{"RecordIndex":0,"RecordSize":10,"Lng":114.52082821245915,"Lat":38.048684212036896,"VirtualLng":120.6174,"VirtualLat":31.335106,"SearchName":"服务员","SortType":1,"SalaryRoundId":0,"IndustryId":0,"ProfessionalIds":[],"AreaId":0,"Pneumonia":0}'
header = {
    "Content-Type": "application/json; charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
}
data = {
    "AppVer": "1.0.0",
    "TimeStamp": t,
    "Lang": "CN",
    "DeviceName": "web",
    "DeviceType": "web",
    "Token": "",
    "Uid": 0,
    "AppKey": "WKWeb",
    "Sign": sign(d, t),
    "Data": d,
}
url = "https://www.wodedagong.com/ls_api/LS_RecruitDataManager/GetRecruitList"
# A timeout keeps a stalled connection from hanging the script forever.
res = requests.post(url, data=json.dumps(data), headers=header, timeout=30)
res.raise_for_status()
data = json.loads(res.text)

record_count = data['Data']['RecordCount']
# Ceiling division: a final, partially filled page must still be crawled.
# round(count / 10) dropped it whenever the remainder was below 5 (and, due
# to round-half-to-even, sometimes when it was exactly 5).
page_count = int(-(-record_count // 10))

index = 0
for i in range(page_count):
    print("爬取页数:" + str(i))
    list(index)
    index = index + 10
admin
版权声明:本站原创文章,由admin于2021-12-15发表,共计6990字。
转载提示:除特殊说明外本站文章皆由CC-4.0协议发布,转载请注明出处。
评论(没有评论)
载入中...