首页>比特动态>动态IP如何抓取快递信息?
动态IP如何抓取快递信息?

运营人员需要抓取快递信息,用的第三方的不太靠谱,自己前端遍历,发现每一个IP抓取50条,就被屏蔽了,也可以每秒5~6个慢慢抓,测试过不会被屏蔽

准备工作


动态IP如何抓取快递信息?


用的request去抓取信息和获取IP地址 

前端对接 用的是koa2 ,koa-bodyparser处理post请求,koa2-cors处理前端请求跨域


动态IP设置是在网上百度到的,自己做了一些修改


 package


{

  "name": "ip",

  "version": "1.0.0",

  "description": "",

  "main": "index.js",

  "scripts": {

    "test": "echo \"Error: no test specified\" && exit 1"

  },

  "author": "",

  "license": "ISC",

  "dependencies": {

    "bluebird": "^3.5.1",

    "koa": "^2.5.2",

    "koa-bodyparser": "^3.2.0",

    "koa2-cors": "^2.0.6",

    "query-string": "^6.1.0",

    "request": "^2.88.0"

  }

}

 


以下是动态IP设置和快递信息请求的代码


const request = require("request");

 

const Promise = require("bluebird");

const queryString = require('query-string');

 

// Pool of browser User-Agent strings. reptile() picks one entry at random for
// every outgoing request. Note that a few entries appear more than once.
const userAgents = [

    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',

    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',

    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',

    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',

    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',

    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',

    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',

    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',

    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',

    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',

    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',

    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',

    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',

    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',

    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',

    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'

];

// Demo only: this state is kept in plain module-level variables; a real
// application should store it in a proper data cache instead.

const expiryTime = 1 * 60 * 1000;// expiry interval, in milliseconds (currently only referenced by the commented-out cache check in getProxyList)

let ips = null; // proxy list ("IP:Port" strings once getProxyList has filled it in)

let time = Date.now();// timestamp of when the proxy list was stored; meant for deciding whether it has expired and must be requested again

 

/**
 * Request a batch of proxies from the dynamic-IP API and store it in the
 * module-level `ips` variable (demo-only storage, no real cache).
 *
 * When the refresh fails -- transport error, empty or non-JSON body,
 * `success === 'false'`, or a payload without a `data` array -- the previously
 * stored list is kept and returned. If there is no previous list to fall back
 * on, the promise rejects instead of resolving with `null`, because callers
 * index into the list right away.
 *
 * @param {boolean} [flag] - Kept for backward compatibility. Callers pass
 *   `true` to request a forced refresh, but every call already hits the API
 *   since no expiry check is active.
 * @returns {Promise<string[]>} Resolves with proxies formatted as "IP:Port".
 */
const getProxyList = (flag) => {
    return new Promise((resolve, reject) => {
        // Endpoint of the dynamic-IP provider
        const apiURL = 'http://xx.xxx.xxx.xx:xxxx/Index-generate_api_url.html?packid=7&fa=5&qty=10&port=1&format=json&ss=5&css=&pro=&city=';

        const options = {
            method: 'GET',
            url: apiURL,
            gzip: true,
            encoding: null, // with `encoding: null` request hands the body back as a Buffer
        };

        // Shared failure path: keep serving the old list when we have one.
        const keepPreviousList = (reason) => {
            console.log(`获取代理IP失败,${reason}`);
            if (ips) {
                resolve(ips);
                return;
            }
            reject(new Error(`获取代理IP失败,${reason}`));
        };

        request(options, (error, response, body) => {
            // On a transport error `body` is undefined, so it must be checked before use.
            if (error || !body) {
                keepPreviousList(error ? error.message : '响应为空');
                return;
            }

            // Parse once; the provider may answer with a non-JSON error page.
            let payload;
            try {
                payload = JSON.parse(body.toString());
            } catch (parseError) {
                keepPreviousList(`响应不是合法JSON: ${parseError.message}`);
                return;
            }
            console.log('body:', payload);

            if (!payload || payload.success === 'false' || !Array.isArray(payload.data)) {
                keepPreviousList('接口未返回代理列表');
                return;
            }

            const ret = payload.data.map(res => res.IP + ':' + res.Port);
            ips = ret;
            console.log(ret)
            time = Date.now(); // remember when this list was stored
            resolve(ret);
        });
    })
}

/**
 * Fetch the tracking record for one parcel from kuaidi100.
 *
 * A random User-Agent from `userAgents` is sent with every request. A random
 * entry of the module-level `ips` list is also chosen and logged, but the
 * proxy option itself is left disabled, as in the original code.
 *
 * @param {Object} data - Query parameters; serialised onto the URL with
 *   `queryString.stringify`.
 * @returns {Promise<Buffer|undefined>} The raw response body, or `undefined`
 *   when the request failed. The promise never rejects because of a request
 *   error; callers must check for an empty body.
 */
function reptile(data) {
    return new Promise((resolve) => {
        // Math.floor instead of parseInt: parseInt stringifies its argument, so a
        // value such as 5e-7 would be read as "5" and yield a wrong index.
        const pickRandom = (list) => list[Math.floor(Math.random() * list.length)];

        const userAgent = pickRandom(userAgents);
        // The proxy list may still be empty/null; that must not crash the crawl.
        const ip = ips && ips.length > 0 ? pickRandom(ips) : undefined;
        console.log('ip:', ip);

        const options = {
            method: 'GET',
            url: 'http://www.kuaidi100.com/query?' + queryString.stringify(data),
            gzip: true,
            encoding: null, // body is delivered as a Buffer
            headers: {
                'User-Agent': userAgent, // rotate the browser identification per request
            },
            // NOTE(review): the proxy is intentionally still disabled here, so `ip`
            // is only logged. To route through it, add: proxy: `http://${ip}`
            timeout: 8000
        };

        request(options, (error, response, body) => {
            // Some proxies/hosts are unreachable; report the failure and let the
            // caller decide what to do with the empty result.
            if (error) {
                console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
                resolve(undefined);
                return;
            }
            console.log('爬取页面成功,  √', body.toString());
            console.log('爬取页面成功,  √', data);
            resolve(body);
        })
    });
}

/**
 * Entry point: make sure a proxy list has been loaded, crawl the page for
 * `data`, and wrap the outcome in a `{ code, data?, msg }` result object.
 *
 * An empty body, or a body containing the "IP forbidden" marker text, counts
 * as a failure: the proxy list is refreshed and a code-400 result is returned.
 */
async function startFun(data) {
    // First call: no proxy list has been stored yet
    if (!ips) {
        await getProxyList();
    }

    const rawBody = await reptile(data);
    const isBlocked = !rawBody || rawBody.toString().includes('非法访问:IP禁止访问');

    if (!isBlocked) {
        // Hand the raw text back; parsing is left to the consumer
        return { code: 200, data: rawBody.toString(), msg: '' };
    }

    // Ask for a fresh proxy list before reporting the failure
    await getProxyList(true);
    return { code: 400, msg: '抓取失败' };
}

//启动方法

module.exports = startFun;

  


koa的代码


// Small koa server exposing the scraper to the browser: GET /chakuaidi?<query>
const Koa = require('koa')

const app = new Koa()

const startFun = require('./startFun')

const bodyParser = require('koa-bodyparser')

app.use(bodyParser())

// CORS middleware: attaches the cross-origin headers to every response.
app.use(async (ctx, next) => {
    // Allow requests from every origin.
    // NOTE(review): a wildcard origin combined with Allow-Credentials: true is
    // likely rejected by browsers for credentialed requests -- confirm whether
    // credentials are really needed.
    ctx.set("Access-Control-Allow-Origin", "*");
    ctx.set("Access-Control-Allow-Methods", "OPTIONS, GET, PUT, POST, DELETE");
    ctx.set("Access-Control-Allow-Headers", "x-requested-with, accept, origin, content-type");
    ctx.set("Content-Type", "application/json;charset=utf-8");
    ctx.set("Access-Control-Allow-Credentials", true);
    ctx.set("Access-Control-Max-Age", 300);
    ctx.set("Access-Control-Expose-Headers", "myData");

    // Answer preflight requests directly; otherwise they fall through to the
    // default 404 and the browser refuses the real request.
    if (ctx.method === 'OPTIONS') {
        ctx.status = 204;
        return;
    }

    await next();
})

// Route handler: GET /chakuaidi forwards the query string to the scraper.
app.use(async (ctx) => {
    if (ctx.path !== '/chakuaidi' || ctx.method !== 'GET') {
        return; // anything else keeps koa's default 404
    }

    try {
        ctx.body = await startFun(ctx.query);
    } catch (err) {
        // Report scraper exceptions with the same failure payload startFun uses,
        // so the client's existing "code !== 200 -> retry" logic still applies
        // instead of receiving a bare HTTP 500.
        console.log('抓取异常:', err);
        ctx.body = { code: 400, msg: '抓取失败' };
    }
})

app.listen(3000, () => {
    console.log('demo2 is run')
})

  前端代码


  var a = ['xxxxxxxx', 'xxxxxxx', 'xxxxxx', ]

        function aa(index) {

            index = index || 0;

            if (index >= a.length) return;

            setTimeout(function () {

                $.get('http://xxx.xx.xx.xxx:xxxx/chakuaidi?type=yunda&postid=' + a[index] + '&temp=' + Math.random(), function (res) {

                    if (res.code === 200) {

                        res = res.data,

                            res = JSON.parse(res);

                        console.log((res.nu || a[index]) + ',' + index + ',' + (!res.data.length ? '暂无数据' : res.data[0].context));

                        aa(index + 1)

                    } else {

                        aa(index)

                    }

 

                })

            },1000)

        }

        console.log(a.length)

  


这样请求比每秒查几个快了很多,但发现查到150个左右就开始报禁止IP,应该是IP太少了;不过比一条一条查感觉快很多,量少时推荐使用。

前端拿到数据后,再写一个前端的导出功能,把数据直接导出来就可以用了。