# Python基于Searx进行信息搜索 (Searx-based information search in Python)
# Python版本: 3.7
# 代码如下:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import time
from typing import List, Dict
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler('searx_search.log'),
logging.StreamHandler()
]
)
# Searx实例列表URL
SEARX_INSTANCES_URL = 'https://data.myfpga.cn/searx.txt'
# 最大并发数
MAX_CONCURRENT = 3
class SearxSearcher:
    """Concurrent meta-search client for a pool of Searx instances.

    Fetches an instance list at construction time, then fans a query out
    to up to MAX_CONCURRENT instances on a thread pool and merges the
    parsed results. Use as a context manager (or call close()) so the
    thread pool and HTTP session are released.
    """

    def __init__(self):
        self.session = requests.Session()
        # Browser-like UA: some instances reject obvious script clients.
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        self.search_instances = self._load_instances()
        self.executor = ThreadPoolExecutor(max_workers=MAX_CONCURRENT)

    def close(self) -> None:
        """Shut down the thread pool and close the HTTP session.

        The original implementation leaked both; call this (or use the
        instance as a context manager) when done searching.
        """
        self.executor.shutdown(wait=True)
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False  # never suppress exceptions from the with-body

    def _load_instances(self) -> List[str]:
        """Load up to 10 Searx base URLs from SEARX_INSTANCES_URL.

        Returns:
            At most 10 instance base URLs, or a single hard-coded
            fallback if the list cannot be fetched or the server answers
            with an error status.
        """
        try:
            response = self.session.get(SEARX_INSTANCES_URL, timeout=10)
            # Without this check an HTTP error page would be parsed as a
            # list of bogus "instances".
            response.raise_for_status()
            return [i.strip() for i in response.text.splitlines() if i.strip()][:10]
        except Exception as e:
            logging.error(f"实例加载失败: {str(e)}")
            return ["https://search.us.projectsegfau.lt"]

    def search(self, query: str, pages: int = 10) -> List[Dict]:
        """Search *query* on the first MAX_CONCURRENT instances in parallel.

        Args:
            query: The search string.
            pages: Number of result pages requested per instance.

        Returns:
            Merged result dicts (title/url/content), capped at
            pages * 10 entries (~10 results per page).
        """
        futures = {
            self.executor.submit(self._search_instance, instance, query, pages): instance
            for instance in self.search_instances[:MAX_CONCURRENT]
        }
        results = []
        for future in as_completed(futures):
            try:
                instance_results = future.result()
            except Exception as e:
                # One failing instance must not abort aggregation of the
                # results from the remaining instances.
                logging.error(f"搜索失败: {futures[future]} - {str(e)}")
                continue
            if instance_results:
                results.extend(instance_results)
        return results[:pages * 10]

    def _search_instance(self, instance: str, query: str, pages: int) -> List[Dict]:
        """Fetch and parse up to *pages* result pages from one instance.

        Stops early on the first HTTP error, missing results container,
        or parse failure; returns whatever was collected so far.
        """
        results = []
        for page in range(1, pages + 1):
            try:
                response = self.session.get(
                    f"{instance}/search",
                    params={
                        'q': query,
                        'category_general': 1,
                        'language': 'auto',
                        'time_range': '',
                        'safesearch': 0,
                        'theme': 'simple',
                        'pageno': page
                    },
                    timeout=15
                )
                if not response.ok:
                    logging.warning(f"请求失败: {instance} 第 {page} 页")
                    break
                soup = BeautifulSoup(response.text, 'html.parser')
                main_div = soup.find('div', id='results')
                if not main_div:
                    logging.warning(f"未找到结果: {instance} 第 {page} 页")
                    break
                for article in main_div.find_all('article', class_='result'):
                    # Look each node up once instead of twice per field.
                    title_tag = article.find('h3')
                    url_tag = article.find('a', class_='url_header')
                    content_tag = article.find('p', class_='content')
                    results.append({
                        'title': title_tag.get_text(strip=True) if title_tag else '无标题',
                        'url': url_tag['href'] if url_tag else '无URL',
                        'content': content_tag.get_text(strip=True) if content_tag else '无内容',
                    })
                time.sleep(0.5)  # be polite: throttle successive page requests
            except Exception as e:
                logging.error(f"解析失败: {instance} 第 {page} 页 - {str(e)}")
                break
        return results
if __name__ == "__main__":
searcher = SearxSearcher()
query = "myfpga.cn"
results = searcher.search(query, pages=10)
for i, result in enumerate(results, 1):
print(f"结果 {i}:")
print(f"标题: {result['title']}")
print(f"URL: {result['url']}")
print(f"内容: {result['content']}")
print("-" * 80)


