【hifini】采集，无损音乐下载 | 初夏博客 - 奇异网络

发现个宝藏网站，可以下载无损音乐

开通vip 发现只能单个下载。遂只能开始批量采集了。

然后发现页面做了 CSS 混淆。判断 span 的样式是否为 inline 即可绕过。
代码附在下面。注：需要填写 VIP cookie 才能批量采集。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@author: chuxia
@contact: 676463@qq.com
@blog: blog.teqiyi.com
@file: main.py
@time: 2023/5/23 2:52 PM
'''
import os
import time

import requests
from lxml import etree

# One shared HTTP session so every request carries the same headers.
s = requests.Session()
# Browser-like User-Agent sent with each request.
s.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
# Placeholder value: replace it with the cookie of a logged-in account
# before running the script.
s.headers['cookie'] = '输入你的cookie'

# Site root; relative thread paths are appended to this.
baseUrl = 'https://hifini.com/'


def getStyle(text, style):
    """Report whether the CSS rule following ``text`` mentions ``inline``.

    The first occurrence of ``text`` (a class name) is located in ``style``
    (raw CSS text); the declaration block that follows it, from the next
    ``{`` through the next ``}``, is then searched for the substring
    ``inline``.

    Args:
        text: Class name to look up in the stylesheet.
        style: Stylesheet text to search.

    Returns:
        True if the matched declaration block contains ``inline``.
        False if it does not, or if ``text`` is empty or absent, or if no
        complete ``{...}`` block follows it.
    """
    # NOTE(review): this is a plain substring search, not a CSS parser, so a
    # class name that is a prefix of another class can match the wrong rule.
    if not text:
        return False

    start = style.find(text)
    if start == -1:
        # Unknown class: never inspect an unrelated rule.
        return False

    open_brace = style.find('{', start + len(text))
    if open_brace == -1:
        return False

    close_brace = style.find('}', open_brace)
    if close_brace == -1:
        return False

    # Only the declaration block is inspected, so a class name that itself
    # contains "inline" cannot produce a false positive.
    return 'inline' in style[open_brace:close_brace + 1]


def save(title, ids, a, pwd):
    """Write one scraped record to ``./file/<ids>.txt`` unless it exists.

    The file holds a single line: ``title----a----pwd``.

    Args:
        title: Thread title.
        ids: Thread id, used as the file name.
        a: Download link.
        pwd: Code extracted from the page.
    """
    fileName = './file/' + ids + '.txt'
    # An existing file is never overwritten.
    if os.path.exists(fileName):
        return
    # Create the output directory on first use instead of failing with
    # FileNotFoundError when ./file is missing.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)
    # Titles may contain non-ASCII text; explicit UTF-8 avoids
    # UnicodeEncodeError under platform-default encodings.
    with open(fileName, 'w', encoding='utf-8') as f:
        f.write(title + '----' + a + '----' + pwd)

def getPageInfo(title, path):
    """Scrape one thread page and save its download link and code.

    The page's inline ``<style>`` blocks are collected, then the spans inside
    the matched ``alert alert-success`` element are filtered with
    ``getStyle`` so that only spans whose class rule mentions ``inline``
    contribute their text to the code. The record is saved only when both
    link and code are non-empty and the link contains ``lanzou``.

    Any failure (network, parsing, missing elements) is reported on stdout
    and the thread is skipped; nothing is raised to the caller.

    Args:
        title: Thread title, used for logging and stored in the record.
        path: Thread path relative to ``baseUrl``, e.g. ``thread-11382.htm``.
    """
    url = baseUrl + path
    print(url)
    try:
        # Take the id from the relative path, the same way getPageList does,
        # so both functions always agree on the output file name.
        ids = path.split('-')[1].split('.')[0]
        # Fetch and parse inside the try block so that one unreachable or
        # malformed page cannot abort the whole crawl; the timeout keeps a
        # stalled connection from hanging forever.
        res = s.get(url, timeout=30).text
        html = etree.HTML(res)
        style_nodes = html.xpath('//*[@id="body"]/div/style')
        # The loop variable is deliberately not named "s", so it cannot be
        # confused with the shared session.
        style = ','.join([node.xpath('./text()')[0] for node in style_nodes])
        span = html.xpath("//*[@class='alert alert-success'][2]/span")
        a = html.xpath("//*[@class='alert alert-success'][2]/a/@href")[0]
        # Keep only the spans whose class rule mentions "inline".
        pwd = ''.join([i.xpath('./text()')[0] for i in span if getStyle(i.xpath('./@class')[0], style)])
        if a != '' and pwd != '' and 'lanzou' in a:
            print(title)
            print(ids)
            print(a)
            print(pwd)
            save(title, ids, a, pwd)
    except Exception as e:
        # Best-effort crawl: report the failure and move on.
        print(title + '采集失败' + '----' + str(e))


def getPageList(start_page=1, end_page=2470, delay=5):
    """Walk the forum listing pages and scrape every thread not yet saved.

    Each listing page is fetched and its ``subject break-all`` entries are
    read. A thread is skipped when its ``./file/<id>.txt`` already exists;
    otherwise it is handed to ``getPageInfo``.

    Args:
        start_page: First listing page to fetch.
        end_page: Page number at which to stop (exclusive).
        delay: Seconds to sleep after each listing page.
    """
    for page in range(start_page, end_page):
        url = baseUrl + 'forum-1-%s.htm?orderby=lastpid' % page
        try:
            # The timeout keeps a stalled connection from hanging the crawl.
            res = s.get(url, timeout=30).text
        except requests.RequestException as e:
            # One unreachable listing page should not end the whole run.
            print('---------------%s页采集失败----%s' % (page, e))
            time.sleep(delay)
            continue
        html = etree.HTML(res)
        title_list = html.xpath("//*[@class='subject break-all']")
        for title in title_list:
            # Entries with an <i> child are skipped.
            # NOTE(review): presumably pinned/announcement rows; confirm.
            if title.xpath('./i'):
                continue
            texts = title.xpath('./a//text()')
            hrefs = title.xpath('./a/@href')
            if not texts or not hrefs:
                # Row without a usable link: nothing to scrape.
                continue
            title_text = texts[0]
            uri = hrefs[0]
            parts = uri.split('-')
            if len(parts) < 2:
                # The href does not look like "thread-<id>.htm".
                continue
            ids = parts[1].split('.')[0]
            fileName = './file/' + ids + '.txt'
            # Already scraped on a previous run.
            if os.path.exists(fileName):
                continue
            getPageInfo(title_text, uri)
            print('-----------------')
        print('---------------%s页采集完成---------------' % page)
        # Pause between listing pages to limit the request rate.
        time.sleep(delay)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    getPageList()