First create the Scrapy project from the command line (cmd) — for example scrapy startproject wyyMusic followed by scrapy genspider wyySpider 163.com (the project name wyyMusic is only a placeholder; the spider name and allowed domain match the code below). Then change two things in settings.py:
1. Turn off the robots.txt protocol.
2. Sort out the cookie setting (the COOKIES_ENABLED line) so that the login cookie we later pass in the request headers is actually sent — see the sketch after this list.
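A minimal sketch of the corresponding settings.py lines. The article does not reproduce the exact settings, so the COOKIES_ENABLED value here is an assumption: setting it to False keeps Scrapy's cookie middleware from rewriting the Cookie header that start_requests builds by hand.

# settings.py — only the lines relevant to this tutorial (sketch, see note above)
ROBOTSTXT_OBEY = False      # 1. do not obey robots.txt
COOKIES_ENABLED = False     # 2. assumed: send our hand-built Cookie header untouched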
Now go back to the spider file wyySpider.py for the preparatory work: change the URL in start_urls and prepare a request header.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import re


class WyyspiderSpider(scrapy.Spider):
    name = 'wyySpider'
    allowed_domains = ['163.com']
    start_urls = ['https://music.163.com/playlist?id=19xxxxx7']
To drive Chrome, selenium also needs the Chrome driver (chromedriver). Download link: https://pan.baidu.com/s/1M-gME2R8EEhEoFlPaDhbmA  extraction code: 7iai

Next, write a getCookie method that logs in through selenium and collects the cookies:
    def getCookie(self):
        # Get the Chrome driver; the argument is the location of the driver downloaded above
        driver = webdriver.Chrome("C:/Users/Administrator/AppData/Local/Programs/Python38/Lib/site-packages/selenium/webdriver/chrome/chromedriver.exe")

        # -----------------selenium automatic login-----------------------
        # Open Chrome and visit the site
        driver.get("https://music.163.com/")
        # Maximize the window; the sleeps give slow connections time to finish loading
        driver.maximize_window()
        time.sleep(1)

        # The coordinates below depend on your own screen resolution
        # Move the mouse from (0,0) by x=1435px, y=35px and left-click
        ActionChains(driver).move_by_offset(1435, 35).click().perform()
        time.sleep(0.3)
        # Click "other login methods"
        ActionChains(driver).move_by_offset(-480, 575).click().perform()
        time.sleep(0.3)
        # Agree to the terms
        ActionChains(driver).move_by_offset(-218, -10).click().perform()
        time.sleep(0.3)
        # Choose phone-number login
        ActionChains(driver).move_by_offset(107, -100).click().perform()
        time.sleep(0.3)

        # Enter account and password:
        # locate the element with id "p" via a CSS selector, then send_keys types into it
        driver.find_element_by_css_selector("#p").send_keys("account")
        driver.find_element_by_css_selector("#pw").send_keys("password")
        time.sleep(0.3)
        # Click the login button
        ActionChains(driver).move_by_offset(110, 15).click().perform()
        time.sleep(1)

        # Find the avatar and hover over it
        img = driver.find_element_by_css_selector("div.head:nth-child(1) > img:nth-child(1)")
        ActionChains(driver).move_to_element(img).perform()
        time.sleep(0.5)
        # Click "my homepage"
        ActionChains(driver).move_by_offset(0, 40).click().perform()
        time.sleep(0.5)
        # Click "music I like"
        ActionChains(driver).move_by_offset(-870, 830).click().perform()
        time.sleep(0.3)
        # -----------------selenium automatic login-----------------------
At this point driver.get_cookies() returns a list of dictionaries that looks like this:

[{'domain': 'music.163.com', 'expiry': 2147483647, 'httpOnly': False, 'name': 'WM_TID', 'path': '/', 'secure': False, 'value': 'UnQj6SSNqN9BEVdubmNcEjpl%2B9DA'}, {'domain': 'music.163.com', 'expiry': 2147483647, 'httpOnly': False, 'name': 'WM_NIKE', 'path': '/', 'secure': False, 'value': '9ca17ae2e6ffcda170e2e6ee87f4508ef58483ea4a97968ea7c54e879a8eaaf445aebc83b6e933f3f1c0b4c82af0fea7c3b92af697b7a6dc7b82afc09ad98ca695bc5082ecbcb1e772b7889b3d1c15bf28da0bbfb5b95aa8795f073adbc9c98ed79a28d8aa7f450f1ae9dd9b77a85edbf9ac625f1ef84d8f970b4e7bfd8cd21b48e8c8ec17df3e7a898f74488ef9bb5c837e2a3'}, {'domain': '.music.163.com', 'httpOnly': False, 'name': 'WNMCID', 'path': '/', 'sameSite': 'Strict', 'secure': False, 'value': 'fdygqk.1611989994304.01.0'}, {'domain': '.music.163.com', 'httpOnly': False, 'name': 'WEVNSM', 'path': '/', 'sameSite': 'Strict', 'secure': False, 'value': '1.0.0'}, {'domain': 'music.163.com', 'expiry': 2147483647, 'httpOnly': False, 'name': 'WM_NI', 'path': '/', 'secure': False, 'value': '6IyEYqBqpyZMITjt9DB4tPdzuXUFC%2BNyOiu3S04CTC5Nsv2Q4gkMM0BQ2SPZxQWvItmyodTwnsbSFFqD3rS84rG3qyG%2F31L7zdp9q7N%2BpRDmBw19hwtHD1UTE%3D'}, {'domain': '.music.163.com', 'expiry': 1927349994, 'httpOnly': False, 'name': 'NMTID', 'path': '/', 'secure': False, 'value': '00O-pWx8ZDJJQfiFkHzsgin07nYSmUAAAF3UhdN2w'}, {'domain': '.163.com', 'expiry': 4765589994, 'httpOnly': False, 'name': '_ntes_nuid', 'path': '/', 'secure': False, 'value': '738fc9cd89d6d8799fa76b3348d25d7d'}, {'domain': '.163.com', 'expiry': 4765589994, 'httpOnly': False, 'name': '_ntes_nnid', 'path': '/', 'secure': False, 'value': '738fc9cd89d6d8799fa76b3348d25d7d,1611989994150'}, {'domain': '.music.163.com', 'expiry': 1769671794, 'httpOnly': False, 'name': '_iuqxldmzr_', 'path': '/', 'secure': False, 'value': '32'}, {'domain': '.music.163.com', 'expiry': 1769671794, 'httpOnly': False, 'name': 'JSESSIONID-WYYY', 'path': '/', 'secure': False, 'value': 'OoCMxNwGV%5CfZD2OSzAXovf4ASVZsJ8UQ4sgg7JfH075cKTD%2FW3sMzZj%2BpayS1EnNVXzRm%2F2GxfzIoNv3FTjYxKeNFZWqf6UeiMSc1%2BG98kgsEM94juuE%5Cs18k2%2BPNPAp3hU0G%5CFDUtjkimCR5pgOIOI%3A1611991794102'}]
Scrapy wants the Cookie header as a single string, so pull the name and value out of each dictionary and join them:

        # Build "name=value" pairs from the cookie dicts driver collected
        temp = []
        for i in driver.get_cookies():
            temp.append(i['name'] + "=" + i['value'])
        # return the cookie string
        return ';'.join(temp)
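For reference, that join turns the list above into a single header-style string. A minimal standalone sketch, with made-up, shortened values:

# Hypothetical, shortened cookie dicts just to illustrate the join
cookies = [
    {'name': 'WM_TID', 'value': 'UnQj6S...'},
    {'name': 'NMTID', 'value': '00O-pW...'},
]
print(';'.join(c['name'] + "=" + c['value'] for c in cookies))
# -> WM_TID=UnQj6S...;NMTID=00O-pW...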
    def start_requests(self):
        # Call getCookie while building the request headers to fetch the cookie string
        headers = {
            'Cookie': self.getCookie(),
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        }
        # start_urls is a list, so take index [0]; attach the headers and hand the response to parse
        yield scrapy.Request(url=self.start_urls[0], headers=headers, callback=self.parse)
    def parse(self, response):
        # Regular expression that matches the song titles
        patt = re.compile(r'<a href="/song\?id=.*?">([^<|{]*?)</a>')
        # Find all song titles in the page source
        listdata = re.findall(patt, response.text)
        # Write the titles to a txt file
        with open(file="../response.txt", mode="w+", encoding="utf-8") as file:
            for item in listdata:
                file.write(item + "\n")
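A side note on the selenium calls: passing the driver path positionally to webdriver.Chrome and the find_element_by_css_selector helpers are Selenium 3 style and no longer work on current Selenium 4 releases. If you are on Selenium 4, a rough equivalent sketch (same selectors, only the API calls swapped; the driver path is a placeholder):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 style: wrap the driver path in a Service object
# (placeholder path — point it at your own chromedriver.exe)
driver = webdriver.Chrome(service=Service("C:/path/to/chromedriver.exe"))
driver.get("https://music.163.com/")

# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector(...)
driver.find_element(By.CSS_SELECTOR, "#p").send_keys("account")
driver.find_element(By.CSS_SELECTOR, "#pw").send_keys("password")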
Putting it all together, the complete wyySpider.py:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import re


class WyyspiderSpider(scrapy.Spider):
    name = 'wyySpider'
    allowed_domains = ['163.com']
    start_urls = ['https://music.163.com/playlist?id=19xxxxx7']

    def getCookie(self):
        # Get the Chrome driver; the argument is the location of the driver downloaded above
        driver = webdriver.Chrome("C:/Users/Administrator/AppData/Local/Programs/Python38/Lib/site-packages/selenium/webdriver/chrome/chromedriver.exe")

        # -----------------selenium automatic login-----------------------
        # Open Chrome and visit the site
        driver.get("https://music.163.com/")
        # Maximize the window; the sleeps give slow connections time to finish loading
        driver.maximize_window()
        time.sleep(1)

        # The coordinates below depend on your own screen resolution
        # Move the mouse from (0,0) by x=1435px, y=35px and left-click
        ActionChains(driver).move_by_offset(1435, 35).click().perform()
        time.sleep(0.3)
        # Click "other login methods"
        ActionChains(driver).move_by_offset(-480, 575).click().perform()
        time.sleep(0.3)
        # Agree to the terms
        ActionChains(driver).move_by_offset(-218, -10).click().perform()
        time.sleep(0.3)
        # Choose phone-number login
        ActionChains(driver).move_by_offset(107, -100).click().perform()
        time.sleep(0.3)

        # Enter account and password:
        # locate the element with id "p" via a CSS selector, then send_keys types into it
        driver.find_element_by_css_selector("#p").send_keys("account")
        driver.find_element_by_css_selector("#pw").send_keys("password")
        time.sleep(0.3)
        # Click the login button
        ActionChains(driver).move_by_offset(110, 15).click().perform()
        time.sleep(1)

        # Find the avatar and hover over it
        img = driver.find_element_by_css_selector("div.head:nth-child(1) > img:nth-child(1)")
        ActionChains(driver).move_to_element(img).perform()
        time.sleep(0.5)
        # Click "my homepage"
        ActionChains(driver).move_by_offset(0, 40).click().perform()
        time.sleep(0.5)
        # # Click "music I like"
        # ActionChains(driver).move_by_offset(-870, 830).click().perform()
        # time.sleep(0.3)
        # -----------------selenium automatic login-----------------------

        # Build "name=value" pairs from the cookie dicts driver collected
        # temp holds each assembled key=value string
        temp = []
        # loop over the cookie dicts driver returns
        for i in driver.get_cookies():
            temp.append(i['name'] + "=" + i['value'])
        # return the cookie string
        return ';'.join(temp)

    def start_requests(self):
        # Call getCookie while building the request headers to fetch the cookie string
        headers = {
            'Cookie': self.getCookie(),
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        }
        # start_urls is a list, so take index [0]; attach the headers and hand the response to parse
        yield scrapy.Request(url=self.start_urls[0], headers=headers, callback=self.parse)

    def parse(self, response):
        # Regular expression that matches the song titles
        patt = re.compile(r'<a href="/song\?id=.*?">([^<|{]*?)</a>')
        # Find all song titles in the page source
        listdata = re.findall(patt, response.text)
        # Write the titles to a txt file
        with open(file="response.txt", mode="w+", encoding="utf-8") as file:
            for item in listdata:
                file.write(item + "\n")
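Run the spider from the project directory with scrapy crawl wyySpider; the song titles end up in response.txt. To see what the regular expression in parse actually captures, here is a minimal standalone sketch against an invented fragment of the playlist HTML (the href ids and titles are made up for the example):

import re

# Invented sample of the anchor tags found in the playlist page source
html = '<li><a href="/song?id=123456">Song A</a></li><li><a href="/song?id=654321">Song B</a></li>'

patt = re.compile(r'<a href="/song\?id=.*?">([^<|{]*?)</a>')
print(re.findall(patt, html))   # -> ['Song A', 'Song B']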
Author: 我不是秃头哆唻咪 — original post: https://blog.csdn.net/weixin_44864260/article/details/113428996