PyCurl设置HTTP代理示例

十点数据 1年前 ⋅ 2608 阅读
import pycurl , time, traceback
from com.fy.utils.html.HtmlCode import HtmlCodeUtils
from com.fy.utils.date.DateUtils import Date_Utils
from io import BytesIO  # <-- 这个用到里面的write函数
from gevent import monkey; 
monkey.patch_all()#Mark every I/O operation in this program so gevent can schedule around it
#Without the patch gevent only detects its own built-in I/O operations; once patched it can detect all other I/O in the program.
#NOTE: patch_all() is called here, before the remaining imports below are executed.
import  gevent , certifi
from com.fy.utils.file.FileUtils import File_Utils
from com.fy.utils.http.UserAgentUtils import UserAgentUtils
from com.fy.plugs.proxi.ProxiService import ProxiService
# Module-level ProxiService instance, created at import time.
# NOTE(review): nothing in the visible code references `ps` - confirm whether it is still needed.
ps = ProxiService()
class GeventPyCurlPlugs:
    """Fetch pages through a single pycurl handle, one gevent greenlet per task.

    Each task is a mapping that holds the URL under a caller-chosen key; the
    fetch result (HTML, encoding, status code, final URL, ...) is written back
    into the same mapping.

    Attributes:
        hcu: HtmlCodeUtils helper used to detect the encoding of raw HTML.
        c: the pycurl.Curl handle reused for every request.
        du: Date_Utils helper used for timestamps.
        ua: UserAgentUtils helper that supplies the User-Agent value.
        proxy: optional proxy settings; recognised keys are ``proxy_host``,
            ``proxy_port``, ``proxy_user`` and ``proxy_pass``.
    """

    def __init__(self):
        """Create the curl handle and apply the request-independent options."""
        self.hcu = HtmlCodeUtils()  # detects the page encoding from the HTML source
        self.c = pycurl.Curl()
        self.du = Date_Utils()
        self.ua = UserAgentUtils()
        self.c.setopt(pycurl.CONNECTTIMEOUT, 120)  # connect timeout
        self.c.setopt(pycurl.TIMEOUT, 120)  # download timeout
        self.c.setopt(pycurl.MAXREDIRS, 5)  # maximum number of redirects
        self.c.setopt(pycurl.CAINFO, certifi.where())

        # Some HTTPS sites fail certificate validation, so verification is
        # switched off here.
        # NOTE(review): this disables TLS host/peer verification for every
        # request made through this handle - confirm this is acceptable.
        self.c.setopt(pycurl.SSL_VERIFYHOST, False)
        self.c.setopt(pycurl.SSL_VERIFYPEER, False)
        self.proxy = {}

    def geventPycControl(self, tasks, urlFieldName):
        """Fetch every task in ``tasks`` and return them with the elapsed time.

        Args:
            tasks: sized iterable of task mappings; each is updated in place
                by ``fetch_html_pycurl``.
            urlFieldName: key under which each task stores its URL.

        Returns:
            A ``(tasks, elapsed_seconds)`` tuple.
        """
        print("待处理的任务【" + str(len(tasks)) + "】个...")
        start_time = time.time()
        self.urlFieldName = urlFieldName  # key that holds the URL in each task
        jobs = [gevent.spawn(self.fetch_html_pycurl, task) for task in tasks]
        # Results are written into the task mappings, so the return value of
        # joinall is not needed.
        gevent.joinall(jobs, timeout=120, raise_error=False)
        elapsed = time.time() - start_time
        # Format explicitly: slicing str(float) breaks for values such as 1e-05.
        print("下载页面【完毕】,共历时:【" + ("%.2f" % elapsed) + "】s.....\n")
        return tasks, elapsed

    def _apply_proxy(self):
        """Copy the settings in ``self.proxy`` onto the curl handle, if any."""
        if 'proxy_host' not in self.proxy:
            return
        self.c.setopt(pycurl.PROXY, self.proxy['proxy_host'])
        if 'proxy_port' in self.proxy:
            self.c.setopt(pycurl.PROXYPORT, int(self.proxy['proxy_port']))
        if 'proxy_user' in self.proxy:
            self.c.setopt(pycurl.PROXYUSERPWD, "%(proxy_user)s:%(proxy_pass)s" % self.proxy)

    def _resolve_encoding(self, task, data):
        """Return the encoding declared by ``task`` or detect it from ``data``.

        Detection is used when the task has no usable ``htmlCode`` value, when
        it is ``"automatic"``, or when it is shorter than three characters.
        """
        try:
            declared = task["htmlCode"].strip()
        except (KeyError, AttributeError, TypeError):
            declared = ""
        if declared == "automatic" or len(declared) < 3:
            return self.hcu.getChardet(data)
        return declared

    def fetch_html_pycurl(self, task):
        """Download the URL stored in ``task`` and record the outcome on it.

        Any failure during the request is logged and reported as status 400
        instead of being raised. The fetched HTML is also written to
        ``./html.txt``.

        Args:
            task: mapping holding the URL under ``self.urlFieldName`` and,
                optionally, ``htmlCode`` (encoding hint) and ``id``.

        Returns:
            The same ``task`` with ``urlNew``, ``log_loading_html``,
            ``log_loading_html_code``, ``log_loading_status``,
            ``log_loading_html_length`` and ``log_loading_etime`` filled in.
        """
        host = task[self.urlFieldName]  # URL to fetch
        html = ""  # decoded HTML source
        requestCode = 200  # status code reported back on the task
        htmlCode = urlNew = ""
        try:
            # Present a browser-like User-Agent.
            self.c.setopt(pycurl.USERAGENT, self.ua.getheaders())
            self.c.setopt(pycurl.URL, host)

            # Proxy settings
            self._apply_proxy()
            print(self.du.getCurrentTimeStr(), "设置代理IP【完毕】.....")

            self.c.setopt(self.c.HTTPHEADER, ["Accept: application/vnd.crossref.unixsd+xml"])
            body = BytesIO()
            # WRITEFUNCTION receives the response body, collected in ``body``.
            self.c.setopt(self.c.WRITEFUNCTION, body.write)
            self.c.setopt(self.c.FOLLOWLOCATION, 1)  # 1 = follow redirects, 0 = do not
            self.c.perform()
            data = body.getvalue()
            # Final URL; differs from ``host`` when a redirect was followed.
            urlNew = self.c.getinfo(pycurl.EFFECTIVE_URL)
            htmlCode = self._resolve_encoding(task, data)
            # The body arrives as bytes; undecodable bytes are dropped.
            html = str(data.decode(htmlCode, 'ignore'))

            requestCode = self.c.getinfo(self.c.HTTP_CODE)
            print(htmlCode, "HTML-length:", len(html), "HTTP_CODE:", requestCode, "request-url:", host)
        except Exception:
            requestCode = 400
            # format_exc() returns the traceback text; print_exc() returns None.
            # assumes task is a dict-like mapping with .get() - TODO confirm
            print("加载链接为:", task.get("id"), host, traceback.format_exc())
        fu = File_Utils('./html.txt')
        fu.write_utils(html, False, 'utf8')
        task['urlNew'] = urlNew  # URL after redirects
        task['log_loading_html'] = html  # HTML source
        task['log_loading_html_code'] = htmlCode  # HTML encoding
        task['log_loading_status'] = requestCode  # status code
        task["log_loading_html_length"] = len(html)
        task["log_loading_etime"] = self.du.getCurrentTimeLong()
        return task

最后效果:

pycurl.png

全部评论: 0

    我有话说: