备案网站需要多久,代理浏览器,济南网站建设山东聚搜网见效快,网站建设写什么经营范围1.selenium自动化
selenium可以操作浏览器，在浏览器页面上实现：点击、输入、滑动 等操作。
不同于selenium自动化，逆向本质是：
分析请求，例如：请求方法、请求参数、加密方式等。用代码模拟请求去实现同等…
1.selenium自动化
selenium可以操作浏览器，在浏览器页面上实现：点击、输入、滑动 等操作。
不同于selenium自动化，逆向本质是：
分析请求，例如：请求方法、请求参数、加密方式等。用代码模拟请求去实现同等功能。
逆向 vs 自动化（Selenium）
Selenium：【优】简单，不需要逆向，只需要控制浏览器去执行预设的操作即可；【缺点】性能差，不利于批量实现。
逆向：【优】算法逆向出来后，性能好且利于批量实现；【缺点】语法难搞的js加密算法不容易逆向。
2.必备操作
2.1 模块 & 驱动
安装模块：pip install selenium
下载驱动：Selenium想要控制谷歌、火狐、IE、Edge等浏览器，必须要使用对应的驱动才行。
【Selenium】->【驱动】->【浏览器】
【Selenium】->【火狐驱动】->【火狐浏览器】
【Selenium】->【谷歌驱动】->【谷歌浏览器】
谷歌驱动的下载：
114及之前版本：http://chromedriver.storage.googleapis.com/index.html
117/118/119版本：https://googlechromelabs.github.io/chrome-for-testing/
浏览器版本的获取：在谷歌浏览器上访问 chrome://version/ ，例如：119.0.6045.200 (正式版本) （64 位） (cohort: Stable)
快速使用：
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

time.sleep(5)
driver.close()

2.2 寻找标签
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("打开网址")

# find_element find_elements
tag = driver.find_element(By.ID, "user")
tag = driver.find_element(By.CLASS_NAME, "c1")
tag = driver.find_element(By.TAG_NAME, "div")
tag = driver.find_element(By.XPATH, "/html/body/div[1]/div/div[2]/div[3]/div[3]/div/div/div/div[1]/span[2]")
tag = driver.find_element(By.XPATH, "//*[@id='geetest-wrap']//input[@name='tel']")

tag_list = driver.find_elements(By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/a")
for tag in tag_list:
    print(tag)

time.sleep(5)
driver.close()

示例：5xclass.cn
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://www.5xclass.cn/")

# 根据ID寻找
tag = driver.find_element(By.ID, "bs-example-navbar-collapse-1")
print(tag.text)
print(10 * "-")

# 根据类名寻找
tags = driver.find_elements(By.CLASS_NAME, "panel-heading")
for tag in tags:
    print(tag.text)
print(10 * "-")

# 根据标签名称寻找
tags = driver.find_elements(By.TAG_NAME, "li")
for tag in tags:
    print(tag.text)
print(10 * "-")

# 根据XPATH寻找
tag = driver.find_element(By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/div[1]")
print(tag.text)
print(10 * "-")

# 根据XPATH寻找
tag = driver.find_element(By.XPATH, '//*[@id="bs-example-navbar-collapse-1"]/ul[1]/li[1]/a')
print(tag.text)
print(10 * "-")

# 根据XPATH寻找多个
tags = driver.find_elements(By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/a")
for tag in tags:
    print(tag.text)
print(10 * "-")

# 根据父子关系嵌套寻找
parent = driver.find_element(By.XPATH, "/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div")
tags = parent.find_elements(By.XPATH, 'div[@class="course"]/a')
for tag in tags:
    print(tag.text)

time.sleep(5)
driver.close()

2.3 执行操作
常见的执行操作：点击、输入
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

# 1.点击短信登录
time.sleep(3)
sms_btn = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()  # 点击

# 2.输入账号
phone_txt = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[2]/div[1]/div[1]/input'
)
phone_txt.send_keys("18630087660")  # 输入

time.sleep(55)
driver.close()

2.4 执行JavaScript
如果【选择标签】【执行操作】这种操作起来比较繁琐，也可以直接在页面上去执行js代码实现功能。
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

# ############# 1.点击短信登录 #############
time.sleep(3)
sms_btn = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()

# ############# 2.输入账号 #############
phone_txt = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[2]/div[1]/div[1]/input'
)
phone_txt.send_keys("18630087660")

# ############# 3.选择国家 #############
time.sleep(2)
driver.execute_script('document.querySelector(".area-code-select").children[18].click()')

# ############# 4.读取cookie #############
data_string = driver.execute_script('return document.cookie;')  # return document.title;
print(data_string)

# ############# 5.读取cookie #############
cookie_list = driver.get_cookies()
print(cookie_list)

time.sleep(2550)
driver.close()

2.5 等待
如果页面加载比较慢，需要等待某个元素加载成功后，再执行某些操作。
示例1：基于lambda表达式
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

# ############# 方式1：点击短信登录 #############
time.sleep(3)
sms_btn = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()

# ############# 方式2：点击短信登录（推荐） #############
sms_btn = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
))
sms_btn.click()

示例2：自定义函数
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")


def func(dv):
    print("无返回值则间隔0.5s执行一次此函数；如有返回值则赋值给sms_btn变量")
    # <div xxx="123" id="uuu"></div>
    # <img src="..."/>
    tag = dv.find_element(
        By.XPATH,
        '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
    )
    img_src = tag.get_attribute("xxx")
    if img_src:
        return tag
    return


sms_btn = WebDriverWait(driver, 30, 0.5).until(func)
sms_btn.click()

time.sleep(250)
driver.close()

2.4 执行JavaScript
如果【选择标签】【执行操作】这种操作起来比较繁琐，也可以直接在页面上去执行js代码实现功能。
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

# ############# 1.点击短信登录 #############
time.sleep(3)
sms_btn = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()

# ############# 2.输入账号 #############
phone_txt = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[2]/div[1]/div[1]/input'
)
phone_txt.send_keys("18630087660")

# ############# 3.选择国家 #############
time.sleep(2)
driver.execute_script('document.querySelector(".area-code-select").children[18].click()')

# ############# 4.读取cookie #############
data_string = driver.execute_script('return document.cookie;')  # return document.title;
print(data_string)

# ############# 5.读取cookie #############
cookie_list = driver.get_cookies()
print(cookie_list)

time.sleep(2550)
driver.close()

2.5 等待
如果页面加载比较慢，需要等待某个元素加载成功后，再执行某些操作。
示例1：基于lambda表达式
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")

# ############# 方式1：点击短信登录 #############
time.sleep(3)
sms_btn = driver.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()

# ############# 方式2：点击短信登录（推荐） #############
sms_btn = WebDriverWait(driver, 30, 0.5).until(lambda dv: dv.find_element(
    By.XPATH,
    '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
))
sms_btn.click()

示例2：自定义函数
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://passport.bilibili.com/login")


def func(dv):
    print("无返回值则间隔0.5s执行一次此函数；如有返回值则赋值给sms_btn变量")
    # <div xxx="123" id="uuu"></div>
    # <img src="..."/>
    tag = dv.find_element(
        By.XPATH,
        '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
    )
    img_src = tag.get_attribute("xxx")
    if img_src:
        return tag
    return


sms_btn = WebDriverWait(driver, 30, 0.5).until(func)
sms_btn.click()

time.sleep(250)
driver.close()

示例3：全局配置
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

# 后续找元素时，没找到时则最多等待30秒去寻找，一旦找到则继续
driver.implicitly_wait(30)

driver.get("https://passport.bilibili.com/login")

sms_btn = driver.find_element(
    By.XPATH,
    # '//*[@id="app"]/div[2]/div[2]/div[3]/div[1]/div[3]'
    '//*[@id="xxxxxxxxxapp"]/div[2]/div[2]/div[3]/div[1]/div[3]'
)
sms_btn.click()
print("找到了")
time.sleep(250)
driver.close()

2.6 获取值
当找到某个标签之后，想要获取标签内部值。
示例1：文本和属性
例如：<a id="x1" class="info mine" href="5xclass.cn">武沛齐</a>
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)

driver.get("https://www.5xclass.cn")

tag = driver.find_element(
    By.XPATH,
    "/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/a[1]"
)
print(tag.text)
print(tag.get_attribute("target"))
print(tag.get_attribute("data-toggle"))

driver.close()

示例2：获取值
例如：<input type="text" value="?" placeholder="?" />
例如：<select> <option value="1">北京</option> <option value="2">上海</option> </select>  （获取select标签的value属性）
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)

driver.get("https://www.bilibili.com/")

time.sleep(10)

tag = driver.find_element(
    By.XPATH,
    '//*[@id="nav-searchform"]/div[1]/input'
)
print(tag)
print(tag.text)
print(tag.get_attribute("placeholder"))
print(tag.get_attribute("value"))

time.sleep(1000)
driver.close()

示例3：选择相关
<input type="radio" name="findcar" value="1" checked>新车
<input type="radio" name="findcar" value="2">二手车

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)

driver.get("https://www.autohome.com.cn/beijing/")

# ############### 1.单独找到每一个 ###############
tag = driver.find_element(
    By.XPATH,
    "/html/body/div[1]/div[11]/div[2]/div[1]/div[1]/label[1]/span/input"
)
print(tag.get_property("checked"))  # True

tag = driver.find_element(
    By.XPATH,
    "/html/body/div[1]/div[11]/div[2]/div[1]/div[1]/label[2]/span/input"
)
print(tag.get_property("checked"))  # False

# ############### 2.循环找到每一个 ###############
parent = driver.find_element(
    By.XPATH,
    "/html/body/div[1]/div[11]/div[2]/div[1]/div[1]"
)

tag_list = parent.find_elements(
    By.XPATH,
    "label/span/input"
)
for tag in tag_list:
    print(tag.get_property("checked"), tag.get_attribute("value"))

driver.close()

2.7 源码 & bs4
打开页面后，如果基于selenium不太容易定位和寻找，也可以结合bs4来进行寻找。
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.implicitly_wait(10)

driver.get("https://car.yiche.com/")

html_string = driver.page_source

soup = BeautifulSoup(html_string, features="html.parser")
tag_list = soup.find_all(name="div", attrs={"class": "item-brand"})
for tag in tag_list:
    child = tag.find(name="div", attrs={"class": "brand-name"})
    print(child.text)

driver.close()

2.8 携带Cookie
driver.add_cookie({"name": "foo", "value": "bar"})

import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

# 注意：一定要先访问，不然Cookie无法生效
driver.get("https://dig.chouti.com/about")

# 加cookie
driver.add_cookie({
    "name": "token",
    "value": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjZHVfNDU3OTI2NDUxNTUiLCJleHBpcmUiOiIxNzA0MzI5NDY5OTMyIn0.8n_tWcEHXsBSXWIY9rBoGWwaLPF8iWIruryhKTe5_ks"
})

# 再访问
driver.get("https://dig.chouti.com/")

time.sleep(2000)
driver.close()

2.9 IP检测和代理
如果网站进行了IP访问限制，例如：每个IP每天只能操作5次。此时，可以选择购买IP，然后在请求时添加代理IP即可，具体步骤：
1. 购买IP
2. 登录购买IP渠道的后台，配置自己IP白名单
3. 代码携带代理
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# 换成自己生成的代理
res = requests.get(url="https://dps.kdlapi.com/api/getdps/?secret_id=o60wwtxvs5ukaqqz18ai&num=1&signature=i6s9shfjfiogat5ijecbyfwwc5grwrzj&pt=1&format=json&sep=1")
proxy_string = res.json()["data"]["proxy_list"][0]
print(f"获取代理：{proxy_string}")  # 182.106.136.218:40192

service = Service("driver/chromedriver.exe")

opt = webdriver.ChromeOptions()
# opt.add_argument(f'--proxy-server=222.89.70.40:40001')  # 代理
opt.add_argument(f'--proxy-server={proxy_string}')  # 代理
driver = webdriver.Chrome(service=service, options=opt)

driver.get("https://myip.ipip.net/")

time.sleep(2000)
driver.close()

2.10 特征检测
有些网站为了防止selenium，会检测特征并禁止访问。
如果想要正常使用selenium访问，那就需要隐藏浏览器相关的特征。
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service("driver/chromedriver.exe")

opt = webdriver.ChromeOptions()

opt.add_argument("--disable-infobars")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(service=service, options=opt)

# Selenium在打开任何页面之前，先运行这个Js文件。
with open("driver/hide.js") as f:
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})

driver.get("https://www.5xclass.cn")

time.sleep(2000)
driver.close()

2.11 无头和其他
如果不想显示展示在浏览器上的操作，只想偷偷的在后台运行。
opt.add_argument("--headless")

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

service = Service("driver/chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument("--headless")
driver = webdriver.Chrome(service=service, options=opt)

driver.get("https://www.5xclass.cn")
tag = driver.find_element(
    By.XPATH,
    "/html/body/div/div[2]/div/div[2]/div/div[2]/div[2]/div/div/div/div/div[2]/a[1]"
)
print(tag.text)
print(tag.get_attribute("target"))
print(tag.get_attribute("data-toggle"))

driver.close()

其他配置：
opt.add_argument("--disable-infobars")  # 禁止策略化
opt.add_argument("--no-sandbox")  # 解决DevToolsActivePort文件不存在的报错
opt.add_argument("window-size=1920x3000")  # 指定浏览器分辨率
opt.add_argument("--disable-gpu")  # 谷歌文档提到需要加上这个属性来规避bug
opt.add_argument("--incognito")  # 隐身模式（无痕模式）
opt.add_argument("--disable-javascript")  # 禁用javascript
opt.add_argument("--start-maximized")  # 最大化运行（全屏窗口）,不设置，取元素会报错
opt.add_argument("--hide-scrollbars")  # 隐藏滚动条, 应对一些特殊页面
opt.add_argument("lang=en_US")  # 设置语言
opt.add_argument("blink-settings=imagesEnabled=false")  # 不加载图片, 提升速度
opt.add_argument('User-Agent="Mozilla/5.0 (Linux; U; Androi...."')  # 设置User-Agent
opt.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # 手动指定使用的浏览器位置

2.12 截屏
找到某个标签后，可以通过截图的形式保存图片。
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

service = Service("driver/chromedriver.exe")
driver = webdriver.Chrome(service=service)

driver.get("https://www.5xclass.cn")
tag = driver.find_element(
    By.XPATH,
    "/html/body/div/div[2]/div/div[2]/div/div[2]"
)

# 截图&保存
tag.screenshot("demo.png")

# 截图，图片内容
body = tag.screenshot_as_png
print(body)

# 截图，Base64编码格式图片内容
b64_body = tag.screenshot_as_base64
print(b64_body)

driver.close()

3.案例：x东搜索
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# 换成自己生成的代理
res = requests.get(url="https://dps.kdlapi.com/api/getdps/?secret_id=o60wwtxvs5ukaqqz18ai&num=1&signature=i6s9shfjfiogat5ijecbyfwwc5grwrzj&pt=1&format=json&sep=1")
proxy_string = res.json()["data"]["proxy_list"][0]
print(f"获取代理：{proxy_string}")

service = Service("driver/chromedriver.exe")
opt = webdriver.ChromeOptions()

opt.add_argument(f'--proxy-server={proxy_string}')  # 代理
opt.add_argument("blink-settings=imagesEnabled=false")  # 不加载图片

opt.add_argument("--disable-infobars")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option("useAutomationExtension", False)

driver = webdriver.Chrome(service=service, options=opt)

driver.implicitly_wait(10)

with open("driver/hide.js") as f:
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})

# 1.打开京东
driver.get("https://www.jd.com/")

# 2.搜索框输入
tag = driver.find_element(
    By.XPATH,
    '//*[@id="key"]'
)
tag.send_keys("iphone手机")

# 3.点击搜索
tag = driver.find_element(
    By.XPATH,
    '//*[@id="search"]/div/div[2]/button'
)
tag.click()

# 4.查询列表
tag_list = driver.find_elements(
    By.XPATH,
    '//*[@id="J_goodsList"]/ul/li'
)
for tag in tag_list:
    # title = tag.find_element(By.XPATH, 'div/div[@class="p-name p-name-type-2"]//em').text
    title = tag.find_element(By.XPATH, 'div/div[@class="p-name p-name-type-2"]/a/em').text
    print(title)

driver.close()

4.案例：x麦网
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# 换成自己生成的代理
res = requests.get(url="https://dps.kdlapi.com/api/getdps/?secret_id=o60wwtxvs5ukaqqz18ai&num=1&signature=i6s9shfjfiogat5ijecbyfwwc5grwrzj&pt=1&format=json&sep=1")
proxy_string = res.json()["data"]["proxy_list"][0]
print(f"获取代理：{proxy_string}")

service = Service("driver/chromedriver.exe")
opt = webdriver.ChromeOptions()
opt.add_argument(f'--proxy-server={proxy_string}')  # 代理
opt.add_argument("blink-settings=imagesEnabled=false")
opt.add_argument("--disable-infobars")
opt.add_experimental_option("excludeSwitches", ["enable-automation"])
opt.add_experimental_option("useAutomationExtension", False)
driver = webdriver.Chrome(service=service, options=opt)
driver.implicitly_wait(10)
with open("driver/hide.js") as f:
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": f.read()})

# 1.打开大麦网
driver.get("https://www.damai.cn/")

# 2.搜索框输入
tag = driver.find_element(
    By.XPATH,
    '//input[@class="input-search"]'
)
tag.send_keys("周杰伦")

# 3.点击搜索
tag = driver.find_element(
    By.XPATH,
    '//div[@class="btn-search"]'
)
tag.click()

# 4.查询列表
tag_list = driver.find_elements(
    By.XPATH,
    '//div[@class="search__itemlist"]//div[@class="items"]'
)
for tag in tag_list:
    title = tag.find_element(By.XPATH, 'div[@class="items__txt"]/div[1]/a').text
    print(title)

time.sleep(2000)
driver.close()

如果不加代理，访问频繁时会提示验证码。