Python的selenium利用Chrome的debugger模式采集小红书列表页

当前位置：首页 > Python笔记 > 正文内容

Python的selenium利用Chrome的debugger模式采集小红书列表页

wang
2024-03-01
Python笔记
112浏览
0评论

import hashlib
import time
import random

import pymysql
from pymysql.converters import escape_string
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

# Browser setup: the driver is pointed at the Chrome debugger endpoint on
# 127.0.0.1:9222 via the `debuggerAddress` option.
options = Options()
options.add_experimental_option('debuggerAddress', '127.0.0.1:9222')
bro = webdriver.Chrome(options=options)

def _submit_search(keyword):
    """Type *keyword* into the page's ``search-input`` box and press Enter.

    Uses the module-level browser ``bro``.
    """
    backspace = "\ue003"  # WebDriver key code for BACKSPACE
    enter = '\ue007'      # WebDriver key code for ENTER

    search_box = bro.find_element(By.ID, 'search-input')
    time.sleep(1)
    # One random pause (0.1-1.0 s) is drawn per call and reused between the
    # key presses below.
    pause = random.randint(1, 10) * 0.1
    # Eleven backspaces are sent first -- presumably to wipe the keyword left
    # in the box by the previous round.
    for _ in range(11):
        time.sleep(pause)
        search_box.send_keys(backspace)
    bro.find_element(By.ID, 'search-input').send_keys(keyword)
    time.sleep(0.3)
    bro.find_element(By.ID, 'search-input').send_keys(enter)


def _store_notes(conn, cursor, word, page_html):
    """Insert every non-video note in *page_html* that is not stored yet.

    Notes are de-duplicated by the MD5 of their title against
    ``xiaohongshu_url.title_md5``. Each successful insert also bumps the
    ``chenggong`` counter of the keyword row *word*.
    """
    insert_sql = (
        "insert into xiaohongshu_url "
        "(word,wangzhan_id,time_ruku,time_yuanfabu,url,title,title_md5,gzh,"
        "status,caozuocishu,fid) "
        "values (%s,%s,%s,0,%s,%s,%s,%s,0,1,%s)"
    )
    soup = BeautifulSoup(page_html, "html.parser")
    for note in soup.find_all(name='section', class_="note-item"):
        print(note.text)

        # A play icon marks a video note; those are skipped.
        if note.find(name='span', class_="play-icon") is not None:
            print('这个是视频，所以跳过')
            continue

        title_link = note.find(name='a', class_="title")
        if title_link is None:
            print('xxxxxxxxxxxxxxxx没有标题，所以跳过')
            continue
        title = title_link.text
        title_md5 = hashlib.md5(title.encode()).hexdigest()

        # De-duplicate on the title hash.
        cursor.execute(
            "select * from xiaohongshu_url where title_md5=%s limit 1",
            (title_md5,),
        )
        if cursor.fetchone():
            print('√√√√√√√√√√√√√√√√√√数据已存在，所以跳过')
            continue

        # A note without an author element is stored with an empty author
        # instead of aborting the whole crawl with AttributeError.
        author_tag = note.find(name='span', class_="name")
        gzh = author_tag.text if author_tag is not None else ''
        url = title_link.get('href') or ''

        # Parameterized query: the driver escapes every value, including the
        # keyword and the URL.
        cursor.execute(
            insert_sql,
            (
                word['word'],
                word['wangzhan'],
                int(time.time()),
                url,
                title,
                title_md5,
                gzh,
                word['fid'],
            ),
        )
        conn.commit()
        print('■■■■■■■■■■■■■■■■■■■■■■■■■■插入数据库成功')
        cursor.execute(
            "update xiaohongshu_word set chenggong=chenggong+1 where id=%s",
            (word['id'],),
        )
        conn.commit()


def lst():
    """Run one crawl round: pick a keyword, search it, store the listed notes.

    Steps:

    1. Connect to MySQL, pick one random row of ``xiaohongshu_word`` with
       ``wangzhan < 1111`` and increment its ``cishu`` counter.
    2. Submit the keyword through the search box of the module-level
       browser ``bro``.
    3. For up to 332 scroll steps, set the page scroll position, re-read the
       page source and store the unseen non-video notes it contains. The loop
       ends early once the page source contains ``THE END``.

    The database cursor and connection are always closed, even when a step
    raises.

    Raises:
        RuntimeError: if no keyword row matches the selection query.
    """
    # Connect to the database.
    conn_weixin = pymysql.connect(
        host='122112',
        user='12122',
        password='121212',
        database='12122121',
        charset='utf8mb4',
    )
    try:
        with conn_weixin.cursor(cursor=pymysql.cursors.DictCursor) as cursor_weixin:
            # Pick one keyword at random.
            cursor_weixin.execute(
                "select * from xiaohongshu_word where wangzhan<1111 "
                "order by RAND() limit 1"
            )
            word = cursor_weixin.fetchone()
            if word is None:
                raise RuntimeError(
                    "no row in xiaohongshu_word matches wangzhan<1111"
                )

            # Record that this keyword has been crawled once more.
            cursor_weixin.execute(
                "update xiaohongshu_word set cishu=cishu+1 where id=%s",
                (word['id'],),
            )
            conn_weixin.commit()

            _submit_search(word['word'])

            scroll_base = 100000
            scroll_count = 0
            # Notes are collected after every scroll step because they
            # disappear from the page again as it scrolls.
            for step in range(1, 333):
                # scrollTop starts at 99900 and drops by 100 per step.
                js = "var q=document.documentElement.scrollTop={}".format(
                    scroll_base - step * 100
                )
                time.sleep(0.5)
                bro.execute_script(js)
                page_html = bro.page_source
                _store_notes(conn_weixin, cursor_weixin, word, page_html)

                # Stop once the "THE END" text shows up in the page source.
                if 'THE END' in page_html:
                    print('到底了啊啊啊啊啊啊啊啊啊啊啊')
                    break
                scroll_count += 1
                print('一共下滑滚轴的次数是：', scroll_count)
    finally:
        conn_weixin.close()

# Crawl forever: every call to lst() runs one full round.
while True:
    lst()