Crawler Series: Keyword-Search Collection on the WeChat PC Client via Simulated Clicks

The crawler below drives the WeChat PC client purely through simulated mouse clicks (pymouse) and keyboard events (pykeyboard), passing each search keyword in through the clipboard (pyperclip) and reading article titles and links back out the same way.
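Every pm.click(x, y) call in the script targets a hard-coded screen coordinate, so the values only match one particular screen resolution and window layout. Before running the crawler it is worth checking where those coordinates land on your own screen. The snippet below is a minimal calibration sketch, not part of the original script, assuming the PyUserInput package that provides pymouse is installed:

# calibrate_clicks.py -- hypothetical helper for finding click coordinates
import time
from pymouse import PyMouse

pm = PyMouse()
print("Screen size:", pm.screen_size())
print("You have 5 seconds to move the cursor over a target element "
      "(e.g. the WeChat search box)...")
time.sleep(5)
x, y = pm.position()  # current cursor coordinates
print("Cursor is at:", (x, y), "- use these values in pm.click(x, y)")

With the coordinates adjusted for your own layout, the complete code is as follows: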
'''
Created on Dec 10, 2019

@author: admin
'''
import time, random, pyperclip, traceback
from pymouse import PyMouse
from pykeyboard import PyKeyboard
from com.fy.utils.date.DateUtils import Date_Utils
from com.fy.utils.http.HttpUtils import HttpUtils
from com.fy.utils.file.FileUtils import File_Utils
from com.fy.utils.hash.HashUtils import Hash_Utils
class WeChatMouseInfo:
    def __init__(self):
        self.du = Date_Utils()# date/time helpers
        self.pm = PyMouse()# simulated mouse
        self.kb = PyKeyboard()# simulated keyboard
        self.httpUtil = HttpUtils()# HTTP client for the storage interface
        self.hash = Hash_Utils()# MD5 helper used for deduplication
        self.datas = []# collected results

    #Open the search screen and run the search.
    def control(self, kw, isTheme:"whether this is a theme crawl; theme crawls keep the default (relevance) sort, keyword crawls sort by publish time"):
        print(self.du.getCurrentTimeStr()[11:] + "    click the WeChat shortcut\n")
        self.pm.click(575, 755)# click the WeChat shortcut on the taskbar
        time.sleep(2)

        print(self.du.getCurrentTimeStr()[11:] + "    click the WeChat search box\n")
        self.pm.click(122, 39)# click the WeChat search box
        time.sleep(2)

        print(self.du.getCurrentTimeStr()[11:] + "    click [Article Search] under the search box\n")
        self.pm.click(156, 173)# click [Article Search] under the search box
        time.sleep(2)
        self.pm.click(105, 12)# bring the WeChat main window to the front
        time.sleep(2)
        self.pm.click(800, 15)# close the WeChat main window
        time.sleep(2)

        print(self.du.getCurrentTimeStr()[11:] + "    click the input area to give it keyboard focus\n")
        self.pm.click(500, 345)# click the input area so it has keyboard focus
        time.sleep(2)

        # Copy the keyword to the clipboard and paste it with Ctrl+V;
        # pasting avoids typing Chinese text through the IME key by key.
        pyperclip.copy(kw)

        print(self.du.getCurrentTimeStr()[11:] + "    simulate Ctrl+V\n")
        self.kb.press_key(self.kb.control_key)
        self.kb.tap_key('v')
        self.kb.release_key(self.kb.control_key)
        time.sleep(2)

        self.kb.tap_key(self.kb.enter_key)# press Enter to run the search
        print(self.du.getCurrentTimeStr()[11:] + "    pressed Enter, search submitted\n")
        time.sleep(5)

        if isTheme:
            self.pm.click(780, 153)# click [Sort by relevance]
            print(self.du.getCurrentTimeStr()[11:] + "    clicked [Sort by relevance]\n")
        else:
            self.pm.click(885, 153)# click [Sort by time]
            print(self.du.getCurrentTimeStr()[11:] + "    clicked [Sort by time]\n")
        print(self.du.getCurrentTimeStr()[11:] + "    +++++++++++++ search screen opened and search executed +++++++++++++\n")
        time.sleep(4)
 
    #Walk the search-result list and collect each entry.
    def crawlerSearchResult(self):
        print(self.du.getCurrentTimeStr()[11:] + "    +++++++++++++ parse the search-result list ++++++++ [start] +++++\n")
        for high in range(0, 5):
            moveHigh = 233 + 111 * high# each result row is about 111 px tall; the first row sits at y=233
            print(self.du.getCurrentTimeStr()[11:] + "    click result [", high + 1, "] in the list")
            self.pm.click(660, moveHigh)# click one entry to open its detail page
            time.sleep(6)
            title, url, author = self.parseInfo()# extract the URL, title and account name
            self.pm.click(1352, 14)# close the detail page
            if url is not None and title is not None:
                data = {}
                data["title"] = title# article title
                data["url"] = url# article link
                data["author"] = self.hash.getMd5HashUtils(author + "#" + title)# MD5 of account name + title, used as the dedup field
                data['dicName'] = "Cralwer_WeChat_List"# cache/collection name
                data["keyField"] = "author"# since Sogou WeChat links keep changing, dedupe on account name + title for now
                self.datas.append(data)
            print()
        print(self.du.getCurrentTimeStr()[11:] + "    +++++++++++++ parse the search-result list ++++++++ [done] +++++\n")

    #Close the search-result window.
    def closeSearchPanle(self):
        print(self.du.getCurrentTimeStr()[11:] + "    close the search-result page\n")
        self.pm.click(988, 25)# click to close the search window

    #Extract the link, title and account name from the currently opened detail page.
    def parseInfo(self):
        try:
            print(self.du.getCurrentTimeStr() + "    copy the article link\n")
            self.pm.click(180, 56)# click to copy the article link to the clipboard
            time.sleep(2)
            url = pyperclip.paste()# the link is now on the clipboard

            self.pm.click(180, 120)# click inside the article body
            time.sleep(2)
            self.kb.press_keys([self.kb.control_r_key, 'a'])# select all
            time.sleep(2)
            self.kb.press_keys([self.kb.control_r_key, 'c'])# copy
            time.sleep(2)
            text = pyperclip.paste()
            title = text.split("\n")[0].strip()# the first line of the copied text is the title
            author = self.getAuthor(text)
            print(self.du.getCurrentTimeStr()[11:] + "    title:", title)
            print(self.du.getCurrentTimeStr()[11:] + "    link:", url)
            print(self.du.getCurrentTimeStr()[11:] + "    author:", author)
            return title, url, author
        except:
            traceback.print_exc()
            return None, None, None

    #Parse the official-account (service account) name out of the copied text.
    def getAuthor(self, text):
        author = ""
        lineNumber = 0
        for line in text.split("\n"):
            line = line.strip()
            if len(line) > 2:
                lineNumber += 1
                if lineNumber == 2:# the second non-trivial line carries the account name
                    line = line.replace("  ", " ")
                    lines = line.split(" ")
                    author = lines[len(lines) - 2]# second-to-last token is the account name
        return author

    #Persist the collected data via the storage interface.
    def saveDatas(self, datas):
        url = "http://XXX.XXX.XXX.XXX:XXXX/hashInter/addListHashRepeat?isChecked=false" #storage interface for the result data
        try:
            result = self.httpUtil.post(url, datas)# the interface deduplicates the records
            if result is None:# retry once if the first post failed
                time.sleep(5)
                result = self.httpUtil.post(url, datas)
            print(result, "\n\n")
        except:
            pass
    
if __name__ == '__main__':
    owc = WeChatMouseInfo()
    fu = File_Utils("./kw.txt")# keyword file
    while True:
        kws = []
        theme = []
        owc.datas = []
        for kw in fu.read_To_List("utf-8"):
            kws.append(kw)
        startTime = owc.du.getCurrentTimeLong()
        for kw in kws:
            owc.control(kw, False)# open the search screen and run the search
            owc.crawlerSearchResult()# parse the result list
            owc.closeSearchPanle()# close the search window
            for data in owc.datas:
                theme.append(data["title"])
            endTime = owc.du.getCurrentTimeLong()
            print(owc.du.getCurrentTimeStr()[11:] + "    collected [" + str(len(owc.datas)) + "] items so far, elapsed [" + str(endTime - startTime) + "] seconds\n")
            time.sleep(100)

        #Second pass: use the titles found by the keyword searches as new keywords for a theme crawl.
        for tkw in theme:
            owc.control(tkw, False)# open the search screen and run the search
            owc.crawlerSearchResult()# parse the result list
            owc.closeSearchPanle()# close the search window
            endTime = owc.du.getCurrentTimeLong()
            print(owc.du.getCurrentTimeStr()[11:] + "    collected [" + str(len(owc.datas)) + "] items so far, elapsed [" + str(endTime - startTime) + "] seconds\n")

        #owc.saveDatas(owc.datas)# persist the parsed search results
        num = random.randint(180, 540)
        print(owc.du.getCurrentTimeStr() + "    " + str(num) + " seconds later for the next round of processing....")
        time.sleep(num)
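A note on input: the main loop reads its keywords from ./kw.txt through the project's File_Utils.read_To_List helper, which is not shown here. Assuming that helper simply returns the file's lines decoded as UTF-8, a plain text file with one search keyword per line (the values below are only hypothetical examples) is all that is needed:

artificial intelligence
industry policy
competitor brand

Each round searches every keyword with time-based sorting and parses the top five results per keyword, then reuses the collected titles as search keywords for a second pass, and finally sleeps for a random 180-540 seconds before starting over.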
