【hifini】采集,无损音乐下载
Published on
发现个宝藏网站,可以下载无损音乐
然后发现页面用 CSS 混淆了提取码,判断 span 对应的样式是否为 inline 即可绕过。
代码附在下面。 注:填写vip cookie才能批量采集
代码附在下面。 注:填写vip cookie才能批量采集
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: chuxia
@contact: 676463@qq.com
@blog: blog.teqiyi.com
@file: main.py
@time: 2023/5/23 2:52 PM
'''
import os
import time
import requests
from lxml import etree
# Shared HTTP session reused by every request in this script.
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# Placeholder value: replace it with your own logged-in cookie string.
s.headers['cookie'] = '输入你的cookie'

# Site root; thread paths are appended to it.
baseUrl = 'https://hifini.com/'
def getStyle(text, style):
    """Return True if the CSS rule following ``text`` in ``style`` mentions ``inline``.

    The first occurrence of ``text`` (a class name) is located in the
    stylesheet text, then the declaration block ``{...}`` that follows it is
    inspected for the substring ``inline``.

    Args:
        text: Class name to look up in the stylesheet.
        style: Raw stylesheet text.

    Returns:
        True when the rule body contains ``inline``; False when it does not,
        or when the class name, the opening brace, or the closing brace
        cannot be found.
    """
    start = style.find(text)
    if start == -1:
        # Unknown class: there is no rule to inspect.
        return False
    open_brace = style.find('{', start + len(text))
    if open_brace == -1:
        return False
    close_brace = style.find('}', open_brace)
    if close_brace == -1:
        return False
    # Only look between the braces so the selector name itself cannot match.
    return 'inline' in style[open_brace:close_brace + 1]
def save(title, ids, a, pwd):
    """Write one scraped record to ``./file/<ids>.txt`` unless it already exists.

    The record is a single line of the form ``title----a----pwd``.

    Args:
        title: Thread title.
        ids: Thread id, used as the file name.
        a: Download link.
        pwd: Extraction code for the link.
    """
    out_dir = './file/'
    fileName = out_dir + ids + '.txt'
    # An existing file means this record was already saved; keep it untouched.
    if os.path.exists(fileName):
        return
    # Create the output directory on first use instead of failing on open().
    os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII titles do not depend on the platform default.
    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(title + '----' + a + '----' + pwd)
def getPageInfo(title, path, timeout=30):
    """Scrape one thread page and save its download link and extraction code.

    The extraction code is assembled from the text of the ``<span>`` elements
    in the second ``alert alert-success`` box, keeping only spans whose CSS
    class rule contains ``inline`` (as judged by ``getStyle``). The record is
    saved only when both link and code are non-empty and the link contains
    ``lanzou``.

    Args:
        title: Thread title, used for logging and in the saved record.
        path: Thread path appended to ``baseUrl``, e.g. ``thread-11382.htm``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Any failure while fetching or parsing is printed and swallowed, so one
    bad thread does not stop the caller.
    """
    url = baseUrl + path
    print(url)
    # The thread id is the number between '-' and '.htm' in the URL.
    ids = url.split('-')[1].split('.')[0]
    try:
        # Fetch inside the try so a network error is reported per thread.
        res = s.get(url, timeout=timeout).text
        html = etree.HTML(res)
        style_nodes = html.xpath('//*[@id="body"]/div/style')
        # Join all text of each <style>; an empty element contributes ''.
        # The loop variable is deliberately not named `s` (the global session).
        style = ','.join(''.join(node.xpath('./text()')) for node in style_nodes)
        span = html.xpath("//*[@class='alert alert-success'][2]/span")
        a = html.xpath("//*[@class='alert alert-success'][2]/a/@href")[0]
        pwd = ''.join([i.xpath('./text()')[0] for i in span if getStyle(i.xpath('./@class')[0], style)])
        if a and pwd and 'lanzou' in a:
            print(title)
            print(ids)
            print(a)
            print(pwd)
            save(title, ids, a, pwd)
    except Exception as e:
        print(title + '采集失败' + '----' + str(e))
def getPageList(start_page=1, stop_page=2470, delay=5, timeout=30):
    """Walk the forum listing pages and scrape every thread not yet saved.

    Args:
        start_page: First listing page to fetch.
        stop_page: Page number at which to stop (exclusive).
        delay: Seconds to sleep after finishing each listing page.
        timeout: Seconds to wait for each listing-page HTTP response.

    The defaults reproduce the original fixed crawl over pages 1 to 2469 with
    a 5 second pause per page.
    """
    for page in range(start_page, stop_page):
        # Build the URL from baseUrl so the host is defined in one place.
        url = baseUrl + 'forum-1-%s.htm?orderby=lastpid' % page
        res = s.get(url, timeout=timeout).text
        html = etree.HTML(res)
        for title in html.xpath("//*[@class='subject break-all']"):
            # Entries with an <i> child are skipped
            # (presumably pinned or announcement rows; confirm against the site).
            if title.xpath('./i'):
                continue
            title_text = title.xpath('./a//text()')[0]
            uri = title.xpath('./a/@href')[0]
            ids = uri.split('-')[1].split('.')[0]
            fileName = './file/' + ids + '.txt'
            # Skip threads whose record file already exists.
            if os.path.exists(fileName):
                continue
            getPageInfo(title_text, uri)
            print('-----------------')
        print('---------------%s页采集完成---------------' % page)
        time.sleep(delay)
# Script entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    getPageList()
初夏大大
2023-09-15
Linux
Google Chrome
试试 
