V2EX = way to explore
V2EX 是一个关于分享和探索的地方
现在注册
已注册用户请  登录
V2EX 提问指南
U87
V2EX  ›  问与答

Python 爬虫 ip 被封,公司给了个付费快代理接口,我先验证付费接口 ip 的可用性,然后拿来爬目标网站,还是出现

  •  
  •   U87 · 2018-06-19 20:22:23 +08:00 · 2159 次点击
    这是一个创建于 2333 天前的主题,其中的信息可能已经有所发展或是发生改变。

    import requests from lxml import etree import time, random from random import choice

    def get_proxy(): url = 'http://svip.kuaidaili.com/api/getproxy/?orderid=&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&quality=2&sep=1' proxy_temp = requests.get(url=url, timeout=1).text

    proxy = {'http':'http://{}'.format(proxy_temp)}
    if requests.get(url='http://nj.58.com/chuzu/?key=%E7%A7%9F%E6%88%BF', proxies=proxy).status_code == 200:
        return proxy
    else:
        get_proxy()
    

    def crawl(): frist_url = 'http://nj.58.com/chuzu'

    headers = [{'User-Agent':'Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'}, {'User-Agent':'Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50'}, {'User-Agent':'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0;'}, {'User-Agent':'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1'}, {'User-Agent':'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11'}, {'User-Agent':'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)'}, {'User-Agent':'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;360SE)'}, {'User-Agent':'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11'}]
    
    s = requests.session()
    s.keep_alive = False
    
    try:
        resp = requests.get(url=frist_url, timeout=0.5).text
    
    except requests.exceptions.RequestException as e:
        print(e)
    
    
    
    attr = etree.HTML(resp)
    
    max_page = attr.xpath('//div[@class="pager"]/a/span/text()')[-2]
    
    for page in range(1, int(max_page)+1):
    
        next_url = frist_url + "/pn" + str(page)
    
        response = requests.get(url=next_url, proxies=get_proxy(), timeout=1, headers=random.choice(headers)).text
    
        attr = etree.HTML(response)
    
        detail_urls = attr.xpath('//ul[@class="listUl"]/li/div[@class="img_list"]/a/@href')
    
        for detail_url in detail_urls:
    
            time.sleep(random.random()*3)
    
            try:
                s = requests.session()
                s.keep_alive = False
    
                r = requests.get(url=detail_url, proxies=get_proxy(), timeout=1, headers=random.choice(headers)).text
    
            except requests.exceptions.RequestException as e:
    
                print(e)
    
            html = etree.HTML(r)
    
            if "pinpaigongyu" in detail_url:
                phone = str(html.xpath('//div[@class="phonenum getPrivateCallBtnStyle"]/text()'))
                rent_type = html.xpath('//div[@class="housedetail center cf"]/h2/text()')[0].split()[0].split('] ')[0].split(' [')[1]
                area = html.xpath('//ul[@class="house-info-list"]/li[1]/span/text()')[0].split()[0]+"平"
                room_type = html.xpath('//ul[@class="house-info-list"]/li[2]/span/text()')[0].split()[0]
                addres = html.xpath('//ul[@class="house-info-list"]/li[4]/span/text()')[0].strip()
                traffic = str(html.xpath('//ul[@class="house-info-list"]/li[5]/span/text()'))
                pictures = html.xpath('//ul[@id="pic-list"]/li/img/@lazy_src')
                house_description = html.xpath('//p[@id="desc"]/text()')[0].replace(' ','')
                print(phone)
    
    
    
    
            else:
                phone = str(html.xpath('//div[@class="house-chat-phonenum"]/p[@class="phone-num"]/text()'))
                rent_type = html.xpath('//ul[@class="f14"]/li[1]/span[2]/text()')[0].split('-')[0]
                area = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()[1]+"平"
                room_type = html.xpath('//ul[@class="f14"]/li[2]/span[2]/text()')[0].split()[0]
                addres = html.xpath('//ul[@class="f14"]/li[6]/span[2]/text()')[0].strip()
                traffic = str(html.xpath('//ul[@class="f14"]/li[5]/em/text()'))
                pictures = html.xpath('//ul[@id="housePicList"]/li/img/@lazy_src')  
                house_description = str(html.xpath('//ul[@class="introduce-item"]/li[2]/span[@class="a2"]//text()')).strip()
                print(phone)
    

    if name == 'main': crawl()

    1 条回复    2018-06-19 20:24:34 +08:00
    U87
        1
    U87  
    OP
       2018-06-19 20:24:34 +08:00
    还是出现 requests.exceptions.ProxyError 难道是在验证和爬目标网站这时间之间 ip 失效了?
    关于   ·   帮助文档   ·   博客   ·   API   ·   FAQ   ·   实用小工具   ·   1086 人在线   最高记录 6679   ·     Select Language
    创意工作者们的社区
    World is powered by solitude
    VERSION: 3.9.8.5 · 22ms · UTC 22:36 · PVG 06:36 · LAX 14:36 · JFK 17:36
    Developed with CodeLauncher
    ♥ Do have faith in what you're doing.