# encoding=utf-8
'''
Created on 2015-11-23
@author: fy
'''
# gevent's monkey patch should be applied as early as possible, before the
# other modules are imported. Without the patch, gevent can only detect I/O
# issued through its own APIs; once patched, it can detect every I/O
# operation the program performs.
from gevent import monkey
monkey.patch_all()  # mark all of this program's I/O operations for gevent

import pycurl, time, traceback
import gevent, certifi
from io import BytesIO  # its write() method serves as the pycurl write callback
from com.fy.utils.html.HtmlCode import HtmlCodeUtils
from com.fy.utils.date.DateUtils import Date_Utils
from com.fy.utils.file.FileUtils import File_Utils
from com.fy.utils.http.UserAgentUtils import UserAgentUtils
class GeventPyCurlPlugs:
    def __init__(self):
        self.hcu = HtmlCodeUtils()  # detects the page encoding from the HTML source
        self.du = Date_Utils()
        self.ua = UserAgentUtils()

    def _new_curl(self):
        """Build a configured Curl handle. A pycurl handle is not safe to
        share between concurrent greenlets, so each request gets its own."""
        c = pycurl.Curl()
        c.setopt(pycurl.CONNECTTIMEOUT, 120)  # connection timeout (seconds)
        c.setopt(pycurl.TIMEOUT, 120)  # download timeout (seconds)
        c.setopt(pycurl.MAXREDIRS, 5)  # maximum number of redirects to follow
        c.setopt(pycurl.CAINFO, certifi.where())
        # Some HTTPS sites fail certificate verification; pycurl lets us skip
        # the verification step entirely.
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        return c
    def geventPycControl(self, tasks, urlFieldName):
        print("Pending tasks: [" + str(len(tasks)) + "]...")
        start_time = time.time()
        self.urlFieldName = urlFieldName  # key under which each task dict stores its URL
        jobs = [gevent.spawn(self.fetch_html_pycurl, task) for task in tasks]
        # joinall returns the greenlets themselves; results are written back into the task dicts
        gevent.joinall(jobs, timeout=120, raise_error=False)
        end_time = time.time()
        print("Page download [finished], total elapsed: [%.2f]s...\n" % (end_time - start_time))
        return tasks, (end_time - start_time)
    def fetch_html_pycurl(self, task):
        host = task[self.urlFieldName]  # URL to fetch
        html = ""  # HTML source
        requestCode = 200  # HTTP status code of the response
        htmlCode = urlNew = ""
        c = self._new_curl()  # fresh handle per request; see _new_curl()
        try:
            # impersonate a browser
            c.setopt(pycurl.USERAGENT, self.ua.getheaders())
            c.setopt(c.URL, host)
            c.setopt(c.HTTPHEADER, ["Accept: application/vnd.crossref.unixsd+xml"])  # request header
            e = BytesIO()
            c.setopt(c.WRITEFUNCTION, e.write)  # write the response body into the BytesIO buffer
            c.setopt(c.FOLLOWLOCATION, 1)  # follow redirects: 0 disables, 1 enables
            c.perform()  # execute the request
            data = e.getvalue()
            urlNew = c.getinfo(pycurl.EFFECTIVE_URL)  # final URL; after a redirect this is the target URL
            htmlCode = str(task.get("htmlCode", "")).strip()
            if htmlCode == "automatic" or len(htmlCode) < 3:
                htmlCode = self.hcu.getChardet(data)  # detect the page encoding
            # perform() yields bytes, so decode; with FOLLOWLOCATION enabled
            # this is the body of the final page after any redirects
            html = str(data.decode(htmlCode, 'ignore'))
            requestCode = c.getinfo(c.HTTP_CODE)
            print(htmlCode, "HTML-length:", len(html), "HTTP_CODE:", requestCode, "request-url:", host)
        except Exception:
            requestCode = 400  # mark the task as failed
            print("Failed to load link:", task["id"], host)
            traceback.print_exc()
        finally:
            c.close()
        # debug dump of the most recently fetched page
        fu = File_Utils('./html.txt')
        fu.write_utils(html, False, 'utf8')
        task['urlNew'] = urlNew  # URL after redirects
        task['log_loading_html'] = html  # HTML source
        task['log_loading_html_code'] = htmlCode  # detected HTML encoding
        task['log_loading_status'] = requestCode  # HTTP status code
        task["log_loading_html_length"] = len(html)
        task["log_loading_etime"] = self.du.getCurrentTimeLong()
        return task
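
# A minimal usage sketch (an assumption, not part of the original module).
# Each task dict needs an "id", its URL under whichever key is passed as
# urlFieldName ("url" here), and optionally "htmlCode" ("automatic" asks the
# fetcher to auto-detect the page encoding). The example.com/example.org URLs
# are placeholders.
if __name__ == "__main__":
    plugs = GeventPyCurlPlugs()
    tasks = [
        {"id": 1, "url": "https://www.example.com", "htmlCode": "automatic"},
        {"id": 2, "url": "https://www.example.org", "htmlCode": "automatic"},
    ]
    done, elapsed = plugs.geventPycControl(tasks, "url")
    for t in done:
        print(t["id"], t["log_loading_status"], t["log_loading_html_length"])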