#!/usr/bin/python
# coding=utf-8
import sys
sys.path.append('..')
from base.spider import Spider
import json
import urllib.parse
import re
class Spider(Spider):
def getName(self):
return "快递🔞"
def init(self, extend=""):
self.host = "https://www.xjjkdfw.sbs"
self.headers = {
'User-Agent': 'Mozilla/5.0 (Linux; Android 11; M2007J3SC Build/RKQ1.200826.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/77.0.3865.120 MQQBrowser/6.2 TBS/045713 Mobile Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Referer': self.host
}
self.log(f"快递🔞爬虫初始化完成,主站: {self.host}")
def isVideoFormat(self, url):
return False
def manualVideoCheck(self):
return True
def homeContent(self, filter):
"""获取首页内容和分类"""
result = {}
classes = self._getCategories()
result['class'] = classes
try:
rsp = self.fetch(self.host, headers=self.headers)
html = rsp.text
videos = self._getVideos(html)
result['list'] = videos
except Exception as e:
self.log(f"首页获取出错: {str(e)}")
result['list'] = []
return result
def homeVideoContent(self):
"""首页视频内容(可留空)"""
return {'list': []}
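    # Category pages follow /vodtype/{tid}.html, with later pages at /vodtype/{tid}/page/{n}.html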
def categoryContent(self, tid, pg, filter, extend):
"""分类内容"""
try:
pg_int = int(pg)
if pg_int == 1:
url = f"{self.host}/vodtype/{tid}.html"
else:
url = f"{self.host}/vodtype/{tid}/page/{pg_int}.html"
self.log(f"访问分类URL: {url}")
rsp = self.fetch(url, headers=self.headers)
html = rsp.text
videos = self._getVideos(html)
pagecount = 999
            page_links = re.findall(r'/vodtype/\d+/page/(\d+)\.html', html)
            if page_links:
                pagecount = max(int(p) for p in page_links)
            return {'list': videos, 'page': pg_int, 'pagecount': pagecount, 'limit': 90, 'total': 999999}
        except Exception as e:
            self.log(f"Category fetch failed: {str(e)}")
            return {'list': [], 'page': pg, 'pagecount': 1, 'limit': 90, 'total': 0}

    def playerContent(self, flag, id, vipFlags):
        """Resolve the play URL for a video"""
        try:
            play_page_url = f"{self.host}/vodplay/{id}.html"
            rsp = self.fetch(play_page_url, headers=self.headers)
            html = rsp.text
            # 1. Prefer a direct stream URL (m3u8/mp4) embedded in the page
            direct_match = re.search(r'["\'](https?://[^"\']+\.(?:m3u8|mp4)[^"\']*)["\']', html)
            if direct_match:
                self.log(f"🎬 Found direct source: {direct_match.group(1)}")
                return {'parse': 0, 'playUrl': '', 'url': direct_match.group(1)}
            # 2. Fall back to an iframe player source
            iframe_match = re.search(r'<iframe[^>]*src=["\']([^"\']+)["\']', html)
if iframe_match:
iframe_url = iframe_match.group(1).strip()
if iframe_url.startswith('//'):
iframe_url = 'https:' + iframe_url
elif iframe_url.startswith('/') and not iframe_url.startswith('http'):
iframe_url = self.host.rstrip('/') + iframe_url
self.log(f"📹 找到iframe播放源: {iframe_url}")
return {'parse': 1, 'playUrl': '', 'url': iframe_url}
            # 3. Last resort: return the play page itself and let the player sniff it
            self.log("⚠️ No playable source found, returning the raw play page")
return {'parse': 1, 'playUrl': '', 'url': play_page_url}
except Exception as e:
self.log(f"播放链接获取出错 (id: {id}): {str(e)}")
return {'parse': 1, 'playUrl': '', 'url': f"{self.host}/vodplay/{id}.html"}
    # ========== Helper methods ==========
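    # The helpers below scrape the site's server-rendered HTML with regular expressions,
    # so the patterns may need updating whenever the site changes its templates.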
def _getCategories(self):
"""从首页提取分类"""
try:
rsp = self.fetch(self.host, headers=self.headers)
html = rsp.text
categories = []
            # Category links look like <a href="/vodtype/{tid}.html">name</a>
            pattern = r'<a[^>]*href="/vodtype/(\d+)\.html"[^>]*>([^<]+)</a>'
matches = re.findall(pattern, html)
seen = set()
for tid, name in matches:
if name.strip() and tid not in seen:
seen.add(tid)
categories.append({'type_id': tid, 'type_name': name.strip()})
return categories
except Exception as e:
self.log(f"获取分类出错: {str(e)}")
return []
def _getVideos(self, html):
"""从HTML中提取视频列表"""
videos = []
        # Expected list-item markup (approximate):
        #   <a href="/vodplay/{vid}-x-y.html"> ... <img data-original="{pic}"> ... </a>
        #   <h3>{title}</h3> ... {category} - {date}
        pattern = (r'<a[^>]*href="(/vodplay/(\d+)-\d+-\d+\.html)"[^>]*>'
                   r'.*?data-original="([^"]+)".*?</a>'
                   r'.*?<h3[^>]*>([^<]+)</h3>'
                   r'.*?([^<]+?)\s*-\s*([^<]+)')
        matches = re.findall(pattern, html, re.DOTALL | re.IGNORECASE)
        for full_play_link, vid, pic, title, category, date in matches:
            if not pic.startswith('http'):
                # Check protocol-relative URLs first, otherwise '//' would be caught by the '/' branch
                if pic.startswith('//'):
                    pic = 'https:' + pic
                elif pic.startswith('/'):
                    pic = self.host + pic
            video = {
                'vod_id': vid,
                'vod_name': title.strip(),
                'vod_pic': pic,
                'vod_remarks': f"{category.strip()} | {date.strip()}"
            }
            videos.append(video)
        return videos

    def _getDetail(self, html, vid):
        """Extract detail info"""
        try:
            # Title
            title = self.regStr(r'