# encoding=utf-8
'''
Created on 2015-11-23
@author: fy
'''
# gevent's monkey patch should be applied as early as possible, before the
# other modules are imported. Without the patch, gevent can only detect I/O
# issued through its own APIs; once patched, it can detect every I/O
# operation the program performs.
from gevent import monkey
monkey.patch_all()  # mark all of this program's I/O operations for gevent

import pycurl, time, traceback
import gevent, certifi
from io import BytesIO  # its write() method serves as the pycurl write callback
from com.fy.utils.html.HtmlCode import HtmlCodeUtils
from com.fy.utils.date.DateUtils import Date_Utils
from com.fy.utils.file.FileUtils import File_Utils
from com.fy.utils.http.UserAgentUtils import UserAgentUtils
class GeventPyCurlPlugs:
    def __init__(self):
        self.hcu = HtmlCodeUtils()  # detects the page encoding from the HTML source
        self.du = Date_Utils()
        self.ua = UserAgentUtils()

    def _new_curl(self):
        """Build a configured Curl handle. A pycurl handle is not safe to
        share between concurrent greenlets, so each request gets its own."""
        c = pycurl.Curl()
        c.setopt(pycurl.CONNECTTIMEOUT, 120)  # connection timeout (seconds)
        c.setopt(pycurl.TIMEOUT, 120)  # download timeout (seconds)
        c.setopt(pycurl.MAXREDIRS, 5)  # maximum number of redirects to follow
        c.setopt(pycurl.CAINFO, certifi.where())
        # Some HTTPS sites fail certificate verification; pycurl lets us skip
        # the verification step entirely.
        c.setopt(pycurl.SSL_VERIFYHOST, 0)
        c.setopt(pycurl.SSL_VERIFYPEER, 0)
        return c
    def geventPycControl(self, tasks, urlFieldName):
        print("Pending tasks: [" + str(len(tasks)) + "]...")
        start_time = time.time()
        self.urlFieldName = urlFieldName  # key under which each task dict stores its URL
        jobs = [gevent.spawn(self.fetch_html_pycurl, task) for task in tasks]
        # joinall returns the greenlets themselves; results are written back into the task dicts
        gevent.joinall(jobs, timeout=120, raise_error=False)
        end_time = time.time()
        print("Page download [finished], total elapsed: [%.2f]s...\n" % (end_time - start_time))
        return tasks, (end_time - start_time)
    def fetch_html_pycurl(self, task):
        host = task[self.urlFieldName]  # URL to fetch
        html = ""  # HTML source
        requestCode = 200  # HTTP status code of the response
        htmlCode = urlNew = ""
        c = self._new_curl()  # fresh handle per request; see _new_curl()
        try:
            # impersonate a browser
            c.setopt(pycurl.USERAGENT, self.ua.getheaders())
            c.setopt(c.URL, host)
            c.setopt(c.HTTPHEADER, ["Accept: application/vnd.crossref.unixsd+xml"])  # request header
            e = BytesIO()
            c.setopt(c.WRITEFUNCTION, e.write)  # write the response body into the BytesIO buffer
            c.setopt(c.FOLLOWLOCATION, 1)  # follow redirects: 0 disables, 1 enables
            c.perform()  # execute the request
            data = e.getvalue()
            urlNew = c.getinfo(pycurl.EFFECTIVE_URL)  # final URL; after a redirect this is the target URL
            htmlCode = str(task.get("htmlCode", "")).strip()
            if htmlCode == "automatic" or len(htmlCode) < 3:
                htmlCode = self.hcu.getChardet(data)  # detect the page encoding
            # perform() yields bytes, so decode; with FOLLOWLOCATION enabled
            # this is the body of the final page after any redirects
            html = str(data.decode(htmlCode, 'ignore'))
            requestCode = c.getinfo(c.HTTP_CODE)
            print(htmlCode, "HTML-length:", len(html), "HTTP_CODE:", requestCode, "request-url:", host)
        except Exception:
            requestCode = 400  # mark the task as failed
            print("Failed to load link:", task["id"], host)
            traceback.print_exc()
        finally:
            c.close()
        # debug dump of the most recently fetched page
        fu = File_Utils('./html.txt')
        fu.write_utils(html, False, 'utf8')
        task['urlNew'] = urlNew  # URL after redirects
        task['log_loading_html'] = html  # HTML source
        task['log_loading_html_code'] = htmlCode  # detected HTML encoding
        task['log_loading_status'] = requestCode  # HTTP status code
        task["log_loading_html_length"] = len(html)
        task["log_loading_etime"] = self.du.getCurrentTimeLong()
        return task
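
# A minimal usage sketch (an assumption, not part of the original module).
# Each task dict needs an "id", its URL under whichever key is passed as
# urlFieldName ("url" here), and optionally "htmlCode" ("automatic" asks the
# fetcher to auto-detect the page encoding). The example.com/example.org URLs
# are placeholders.
if __name__ == "__main__":
    plugs = GeventPyCurlPlugs()
    tasks = [
        {"id": 1, "url": "https://www.example.com", "htmlCode": "automatic"},
        {"id": 2, "url": "https://www.example.org", "htmlCode": "automatic"},
    ]
    done, elapsed = plugs.geventPycControl(tasks, "url")
    for t in done:
        print(t["id"], t["log_loading_status"], t["log_loading_html_length"])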