# Repository viewer metadata (not part of the program):
#   path: fun/js/ss/小黄书.py
#   timestamp: 2026-01-07 04:36:44 +00:00
#   245 lines, 10 KiB, Python, executable file

import sys
import requests
import re
from urllib.parse import urljoin
sys.path.append('..')
from base.spider import Spider
class Spider(Spider):
    """Regex-based scraper for the "小黄书" video site.

    Extends the project's base ``Spider`` (imported from ``base.spider``) and
    implements the listing, detail, search and playback hooks. All pages are
    fetched with ``requests`` and parsed with regular expressions.
    """

    def init(self, extend=""):
        """Set the site host and the default request headers.

        Args:
            extend: Accepted for interface compatibility; not used here.
        """
        self.host = 'https://xchina001.site'
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cache-Control': 'no-cache',
        }
        print(f"使用站点: {self.host}")

    def getName(self):
        """Return the display name of this spider."""
        return "小黄书"

    def isVideoFormat(self, url):
        """Return True when *url* contains a known media extension.

        ``None`` or an empty string yields False.
        """
        return any(ext in (url or '') for ext in ['.m3u8', '.mp4', '.ts'])

    def manualVideoCheck(self):
        """Return False: no manual video check is performed."""
        return False

    def _fetch(self, url, params=None):
        """GET *url* with the spider's headers and return the body as UTF-8 text.

        Args:
            url: Absolute URL to request.
            params: Optional query parameters; ``requests`` URL-encodes them.

        Raises:
            requests.RequestException: On network failure or timeout (10 s).
        """
        res = requests.get(url, headers=self.header, params=params, timeout=10)
        res.encoding = 'utf-8'
        return res.text

    def _extractVideoItems(self, html_content):
        """Extract video entries from a listing page.

        First looks for ``<div class="item video ...">`` cards and reads the
        link, title and background image from each. If no card yields an
        entry, falls back to every ``<a href="/videos/..." title="...">`` link
        in the page (without a picture).

        Args:
            html_content: Raw HTML of the page.

        Returns:
            A list of dicts with ``vod_id``, ``vod_name``, ``vod_pic`` and
            ``vod_remarks``. Card entries keep the ``href`` exactly as found
            in the page; fallback entries carry an absolute URL.
        """
        vods = []
        video_items = re.findall(r'<div[^>]*class="item video[^>]*>(.*?)</div>', html_content, re.S)
        for item in video_items:
            link_match = re.search(r'<a[^>]*href="(.*?)"[^>]*title="(.*?)"[^>]*>', item)
            if not link_match:
                continue
            href = link_match.group(1)
            title = link_match.group(2)
            img = ''
            img_match = re.search(r'background-image:url\((.*?)\)', item)
            if img_match:
                # The CSS url(...) value may be wrapped in quotes.
                img = img_match.group(1).strip('"\'')
                if not img.startswith(('http://', 'https://')):
                    img = urljoin(self.host, img)
            vods.append({
                'vod_id': href,
                'vod_name': title.strip(),
                'vod_pic': img,
                'vod_remarks': ''
            })
        if not vods:
            general_items = re.findall(r'<a[^>]*href="(/videos/.*?)"[^>]*title="(.*?)"[^>]*>', html_content, re.S)
            for href, title in general_items:
                vods.append({
                    'vod_id': urljoin(self.host, href),
                    'vod_name': title.strip(),
                    'vod_pic': '',
                    'vod_remarks': ''
                })
        return vods

    def homeContent(self, filter):
        """Return the fixed list of categories and an empty filter map.

        Args:
            filter: Accepted for interface compatibility; not used here.

        Returns:
            ``{'class': [...], 'filters': {}}`` where each class has a
            ``type_name`` and a site-relative ``type_id`` path.
        """
        video_classes = [
            {'type_name': '麻豆传媒', 'type_id': '/videos/series-5f904550b8fcc.html'},
            {'type_name': '独立创作者', 'type_id': '/videos/series-61bf6e439fed6.html'},
            {'type_name': '糖心Vlog', 'type_id': '/videos/series-61014080dbfde.html'},
            {'type_name': '蜜桃传媒', 'type_id': '/videos/series-5fe8403919165.html'},
            {'type_name': '星空传媒', 'type_id': '/videos/series-6054e93356ded.html'},
            {'type_name': '天美传媒', 'type_id': '/videos/series-60153c49058ce.html'},
            {'type_name': '果冻传媒', 'type_id': '/videos/series-5fe840718d665.html'},
            {'type_name': '香蕉视频', 'type_id': '/videos/series-65e5f74e4605c.html'},
            {'type_name': '精东影业', 'type_id': '/videos/series-60126bcfb97fa.html'},
            {'type_name': '爱豆传媒', 'type_id': '/videos/series-63d134c7a0a15.html'},
            {'type_name': '杏吧原版', 'type_id': '/videos/series-6072997559b46.html'},
            {'type_name': 'IBiZa Media', 'type_id': '/videos/series-64e9cce89da21.html'},
            {'type_name': '性视界', 'type_id': '/videos/series-63490362dac45.html'},
            {'type_name': 'ED Mosaic', 'type_id': '/videos/series-63732f5c3d36b.html'},
            {'type_name': '大象传媒', 'type_id': '/videos/series-65bcaa9688514.html'},
            {'type_name': '扣扣传媒', 'type_id': '/videos/series-6230974ada989.html'},
            {'type_name': '萝莉社', 'type_id': '/videos/series-6360ca9706ecb.html'},
            {'type_name': 'SA国际传媒', 'type_id': '/videos/series-633ef3ef07d33.html'},
            {'type_name': '其他中文AV', 'type_id': '/videos/series-63986aec205d8.html'}
        ]
        return {'class': video_classes, 'filters': {}}

    def categoryContent(self, tid, pg, filter, extend):
        """Return one page of videos for a category.

        Args:
            tid: Category path or absolute URL.
            pg: Page number (int or numeric string); anything empty or
                non-numeric is treated as page 1.
            filter: Accepted for interface compatibility; not used here.
            extend: Accepted for interface compatibility; not used here.

        Returns:
            A dict with ``list``, ``page``, ``pagecount``, ``limit`` and
            ``total``. On any fetch/parse error the list is empty and
            ``pagecount`` is 1.
        """
        url = tid if tid.startswith('http') else urljoin(self.host, tid)
        try:
            pg = int(pg) if pg else 1
        except (TypeError, ValueError):
            # A malformed page value must not escape as an exception.
            pg = 1
        if pg > 1:
            url += f"&page={pg}" if '?' in url else f"?page={pg}"

        result = {}
        try:
            html_content = self._fetch(url)
            vods = self._extractVideoItems(html_content)
            current_page_items = len(vods)
            # Heuristic only: the site's real page count is not parsed, so a
            # "next" marker simply advertises one more page than the current.
            has_next_page = (
                '下一页' in html_content
                or 'next' in html_content.lower()
                or f'page={pg+1}' in html_content
            )
            if has_next_page:
                pagecount = pg + 1
                total = pagecount * current_page_items
            else:
                pagecount = pg
                total = current_page_items
            result['list'] = vods
            result['page'] = pg
            result['pagecount'] = pagecount
            result['limit'] = current_page_items
            result['total'] = total
        except Exception as e:
            print(f"categoryContent error: {e}")
            result['list'] = []
            result['page'] = pg
            result['pagecount'] = 1
            result['limit'] = 30
            result['total'] = 0
        return result

    def _descriptionFromJsonLd(self, raw):
        """Return the first ``description`` found in a JSON-LD payload, or ''.

        Accepts either a single JSON object or a list of objects; invalid
        JSON yields an empty string.
        """
        import json
        try:
            data = json.loads(raw)
        except ValueError:
            return ''
        candidates = data if isinstance(data, list) else [data]
        for entry in candidates:
            if isinstance(entry, dict) and 'description' in entry:
                return entry['description']
        return ''

    def detailContent(self, ids):
        """Return the detail record for the first id in *ids*.

        The page title comes from ``<h1>`` (markup stripped) or, failing
        that, from ``<title>`` up to the first " - ". The cover is read from
        the ``og:image`` meta tag and the description from the description
        meta tag or, as a fallback, from JSON-LD.

        Args:
            ids: Sequence whose first element is a video path or absolute URL.

        Returns:
            ``{'list': [vod]}``. If the page cannot be fetched, the record
            keeps its placeholder values but still carries a play entry.
        """
        vid = ids[0]
        url = vid if vid.startswith(('http://', 'https://')) else urljoin(self.host, vid)
        vod = {
            'vod_id': vid,
            'vod_name': '小黄书视频',
            'vod_pic': '',
            'type_name': '',
            'vod_year': '',
            'vod_area': '',
            'vod_remarks': '',
            'vod_actor': '',
            'vod_director': '',
            'vod_content': '',
            # The play entry only depends on the URL, so it is set even when
            # the detail page itself cannot be fetched.
            'vod_play_from': '瑟佬在线',
            'vod_play_url': f'开撸${url}'
        }
        try:
            html_content = self._fetch(url)

            name = ''
            title_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_content, re.S)
            if title_match:
                # Drop any tags nested inside the heading.
                name = re.sub(r'<[^>]+>', '', title_match.group(1)).strip()
            if not name:
                title_match_alt = re.search(r'<title>(.*?)</title>', html_content, re.S)
                if title_match_alt:
                    name = title_match_alt.group(1).strip().split(" - ")[0]
            if name:
                vod['vod_name'] = name

            cover_match = re.search(r'<meta property="og:image" content="(.*?)"', html_content, re.S)
            if cover_match:
                cover_img = cover_match.group(1).strip()
                if not cover_img.startswith(('http://', 'https://')):
                    cover_img = urljoin(self.host, cover_img)
                vod['vod_pic'] = cover_img

            desc_match = re.search(r'<meta name="description" content="(.*?)">', html_content, re.S)
            if desc_match:
                vod['vod_content'] = desc_match.group(1).strip()
            else:
                jsonld_match = re.search(r'<script type="application/ld\+json">(.*?)</script>', html_content, re.S)
                if jsonld_match:
                    description = self._descriptionFromJsonLd(jsonld_match.group(1))
                    if description:
                        vod['vod_content'] = description
        except Exception as e:
            print(f"detailContent error: {e}")
        return {'list': [vod]}

    def playerContent(self, flag, id, vipFlags):
        """Resolve the playable stream for a video page.

        Looks for the ``src`` passed to ``new VideoPlayer(...)`` in the page
        script. When it names a known media format, the stream URL is
        returned for direct playback; otherwise the page URL is returned
        with ``parse: 1``.

        Args:
            flag: Accepted for interface compatibility; not used here.
            id: URL of the video page.
            vipFlags: Accepted for interface compatibility; not used here.

        Returns:
            A dict with ``parse``, ``url`` and ``header`` (plus ``jx`` for a
            direct stream).
        """
        url = id
        try:
            html = self._fetch(url)
            videoplayer_match = re.search(
                r'const player = new VideoPlayer\(.*?src:\s*["\']([^"\']+?)["\']',
                html,
                re.S,
            )
            if videoplayer_match:
                src = videoplayer_match.group(1)
                # Check the raw value so the page host cannot influence the
                # extension test, then resolve relative sources.
                if self.isVideoFormat(src):
                    return {
                        'jx': 0,
                        'parse': 0,
                        'url': urljoin(url, src),
                        'header': {
                            'User-Agent': 'Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Mobile Safari/537.36',
                            'Referer': url
                        }
                    }
        except Exception as e:
            print(f"playerContent解析错误: {e}")
        return {'parse': 1, 'url': url, 'header': self.header}

    def searchContent(self, key, quick):
        """Search the site for *key*.

        Args:
            key: Search keyword; sent as the ``q`` query parameter so that
                reserved characters are URL-encoded.
            quick: Accepted for interface compatibility; not used here.

        Returns:
            ``{'list': [...]}``; the list is empty on any error.
        """
        result = {'list': []}
        try:
            html_content = self._fetch(f'{self.host}/search', params={'q': key})
            result['list'] = self._extractVideoItems(html_content)
        except Exception as e:
            print(f"searchContent error: {e}")
        return result

    def homeVideoContent(self):
        """Return the videos found on the site's home page.

        Returns:
            ``{'list': [...]}``; the list is empty on any error.
        """
        try:
            return {'list': self._extractVideoItems(self._fetch(self.host))}
        except Exception as e:
            print(f"homeVideoContent error: {e}")
            return {'list': []}

    def localProxy(self, params):
        """Return None: no local proxying is implemented."""
        return None