add Baidu Engine

Gnkalk 2024-12-10 20:40:48 +03:30
parent 4f6c15f5c8
commit 9cc537a106
3 changed files with 194 additions and 31 deletions

requirements.txt

@@ -20,3 +20,4 @@ msgspec==0.18.6
 eval_type_backport; python_version < '3.9'
 typer-slim==0.14.0
 isodate==0.7.2
+beautifulsoup4==4.12.2

searx/engines/baidu.py (new file, 156 lines)

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Baidu (web search)"""

from urllib.parse import urlencode

from bs4 import BeautifulSoup, Tag

from searx.exceptions import SearxEngineException

# about
about = {
    "website": "https://www.baidu.com",
    "wikidata_id": "Q14772",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# engine dependent config
categories = ["general"]

# search-url
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&{query}"

ABSTRACT_MAX_LENGTH = 500


# do search-request
def request(query, params):
    offset = (params["pageno"] - 1) * 10
    params["url"] = baidu_search_url.format(query=urlencode({"wd": query, "pn": offset}))

    # headers: mimic a regular desktop browser, otherwise Baidu may block the request
    params["headers"].update(
        {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
            ),
            "Accept-Encoding": "gzip, deflate",
            "Referer": "https://www.baidu.com/",
        }
    )
    return params


# get response from search-request
def response(resp):
    results = []
    try:
        resp.encoding = "utf-8"
        root = BeautifulSoup(resp.text, "lxml")
        div_contents = root.find("div", id="content_left")
        if div_contents is None:
            return results

        for div in div_contents.contents:
            # skip text nodes, comments, and other non-element children
            if not isinstance(div, Tag):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue

            if "c-container" not in class_list:
                continue

            title = ""
            url = ""
            abstract = ""

            if "xpath-log" in class_list:
                if div.h3:
                    title = div.h3.text.strip()
                    url = div.h3.a["href"].strip()
                else:
                    title = div.text.strip().split("\n", 1)[0]
                    if div.a:
                        url = div.a["href"].strip()

                if div.find("div", class_="c-abstract"):
                    abstract = div.find("div", class_="c-abstract").text.strip()
                elif div.div:
                    abstract = div.div.text.strip()
                else:
                    abstract = div.text.strip().split("\n", 1)[1].strip()

            elif "result-op" in class_list:
                if div.h3:
                    title = div.h3.text.strip()
                    url = div.h3.a["href"].strip()
                else:
                    title = div.text.strip().split("\n", 1)[0]
                    if div.a:
                        url = div.a["href"].strip()

                if div.find("div", class_="c-abstract"):
                    abstract = div.find("div", class_="c-abstract").text.strip()
                elif div.div:
                    abstract = div.div.text.strip()
                else:
                    abstract = div.text.strip().split("\n", 1)[1].strip()

            else:
                if div.get("tpl", "") != "se_com_default":
                    if div.get("tpl", "") == "se_st_com_abstract":
                        if len(div.contents) >= 1:
                            title = div.h3.text.strip()
                            if div.find("div", class_="c-abstract"):
                                abstract = div.find("div", class_="c-abstract").text.strip()
                            elif div.div:
                                abstract = div.div.text.strip()
                            else:
                                abstract = div.text.strip()
                    else:
                        if len(div.contents) >= 2:
                            if div.h3:
                                title = div.h3.text.strip()
                                url = div.h3.a["href"].strip()
                            else:
                                title = div.contents[0].text.strip()
                                # no <h3> here, so take the first <a> if present
                                if div.a:
                                    url = div.a["href"].strip()
                            if div.find("div", class_="c-abstract"):
                                abstract = div.find("div", class_="c-abstract").text.strip()
                            elif div.div:
                                abstract = div.div.text.strip()
                            else:
                                abstract = div.text.strip()
                else:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a["href"].strip()
                    else:
                        title = div.contents[0].text.strip()
                        # no <h3> here, so take the first <a> if present
                        if div.a:
                            url = div.a["href"].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip()

            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            # note: Baidu wraps result links in its own redirector; resolving the
            # real target URL (e.g. by following the redirect) is left out here

            # append result
            results.append({"url": url, "title": title, "content": abstract})
    except Exception as e:
        raise SearxEngineException(str(e)) from e

    # return results
    return results
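
As a sanity check, the request() hook can be exercised outside a running instance. A minimal sketch (not part of the commit), assuming a SearXNG checkout is on the Python path so that importing searx loads its settings:

from searx.engines import baidu

params = {"pageno": 2, "headers": {}}
baidu.request("hello world", params)
# offset = (2 - 1) * 10 = 10, so the encoded query carries wd=hello+world&pn=10
print(params["url"])
# -> https://www.baidu.com/s?ie=utf-8&tn=baidu&wd=hello+world&pn=10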

searx/settings.yml

@@ -15,7 +15,7 @@ general:
   # expose stats in open metrics format at /metrics
   # leave empty to disable (no password set)
   # open_metrics: <password>
-  open_metrics: ''
+  open_metrics: ""

 brand:
   new_issue_url: https://github.com/searxng/searxng/issues/new
@@ -290,17 +290,17 @@ checker:
         lang: en
       result_container:
         - not_empty
-        - ['one_title_contains', 'citizen kane']
+        - ["one_title_contains", "citizen kane"]
       test:
         - unique_results

     android: &test_android
       matrix:
-        query: ['android']
-        lang: ['en', 'de', 'fr', 'zh-CN']
+        query: ["android"]
+        lang: ["en", "de", "fr", "zh-CN"]
       result_container:
         - not_empty
-        - ['one_title_contains', 'google']
+        - ["one_title_contains", "google"]
       test:
         - unique_results
@@ -337,7 +337,8 @@ engines:
     categories: ["images"]
     # https://docs.searxng.org/dev/engines/online/adobe_stock.html
     adobe_order: relevance
-    adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
+    adobe_content_types:
+      ["photo", "illustration", "zip_vector", "template", "3d", "image"]
     timeout: 6
     disabled: true
@@ -439,6 +440,12 @@ engines:
   #   engine: base
   #   shortcut: bs

+  - name: baidu
+    engine: baidu
+    shortcut: bd
+    timeout: 2.0
+    categories: general
+
   - name: bandcamp
     engine: bandcamp
     shortcut: bc
@@ -525,12 +532,12 @@ engines:
     engine: cloudflareai
     shortcut: cfai
     # get api token and accont id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
-    cf_account_id: 'your_cf_accout_id'
-    cf_ai_api: 'your_cf_api'
+    cf_account_id: "your_cf_accout_id"
+    cf_ai_api: "your_cf_api"
     # create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
-    cf_ai_gateway: 'your_cf_ai_gateway_name'
+    cf_ai_gateway: "your_cf_ai_gateway_name"
     # find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
-    cf_ai_model: 'ai_model_name'
+    cf_ai_model: "ai_model_name"
     # custom your preferences
     # cf_ai_model_display_name: 'Cloudflare AI'
     # cf_ai_model_assistant: 'prompts_for_assistant_role'
@@ -601,7 +608,7 @@ engines:
     categories: general
     disabled: true
     paging: true
-    lang_all: ''
+    lang_all: ""
     search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189
     page_size: 20
     results_xpath: //div[@id="site-list-content"]/div[@class="site-item"]
@@ -1654,32 +1661,32 @@ engines:
   - name: stackoverflow
     engine: stackexchange
     shortcut: st
-    api_site: 'stackoverflow'
+    api_site: "stackoverflow"
     categories: [it, q&a]

   - name: askubuntu
     engine: stackexchange
     shortcut: ubuntu
-    api_site: 'askubuntu'
+    api_site: "askubuntu"
     categories: [it, q&a]

   - name: superuser
     engine: stackexchange
     shortcut: su
-    api_site: 'superuser'
+    api_site: "superuser"
     categories: [it, q&a]

   - name: discuss.python
     engine: discourse
     shortcut: dpy
-    base_url: 'https://discuss.python.org'
+    base_url: "https://discuss.python.org"
     categories: [it, q&a]
     disabled: true

   - name: caddy.community
     engine: discourse
     shortcut: caddy
-    base_url: 'https://caddy.community'
+    base_url: "https://caddy.community"
     categories: [it, q&a]
     disabled: true
@@ -1687,7 +1694,7 @@ engines:
     engine: discourse
     shortcut: pi
     categories: [it, q&a]
-    base_url: 'https://discourse.pi-hole.net'
+    base_url: "https://discourse.pi-hole.net"
     disabled: true

   - name: searchcode code
@@ -1800,8 +1807,7 @@ engines:
   - name: torch
     engine: xpath
     paging: true
-    search_url:
-      http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
+    search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
     results_xpath: //table//tr
     url_xpath: ./td[2]/a
     title_xpath: ./td[2]/b
@@ -1971,7 +1977,7 @@ engines:
           lang: en
         result_container:
           - not_empty
-          - ['one_title_contains', 'Tardigrada']
+          - ["one_title_contains", "Tardigrada"]
         test:
           - unique_results
@@ -2208,7 +2214,7 @@ engines:
     disabled: true
     # if you aren't using HTTPS for your local yacy instance disable https
     # enable_http: false
-    search_mode: 'global'
+    search_mode: "global"
     # timeout can be reduced in 'local' search mode
     timeout: 5.0
@@ -2513,11 +2519,11 @@ engines:
   #     keys: ['line']

 doi_resolvers:
-  oadoi.org: 'https://oadoi.org/'
-  doi.org: 'https://doi.org/'
-  doai.io: 'https://dissem.in/'
-  sci-hub.se: 'https://sci-hub.se/'
-  sci-hub.st: 'https://sci-hub.st/'
-  sci-hub.ru: 'https://sci-hub.ru/'
+  oadoi.org: "https://oadoi.org/"
+  doi.org: "https://doi.org/"
+  doai.io: "https://dissem.in/"
+  sci-hub.se: "https://sci-hub.se/"
+  sci-hub.st: "https://sci-hub.st/"
+  sci-hub.ru: "https://sci-hub.ru/"

-default_doi_resolver: 'oadoi.org'
+default_doi_resolver: "oadoi.org"
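
With the engine registered as above, a running instance should answer bang queries for it. A small sketch, assuming a local instance on port 8888 with the JSON output format enabled (both assumptions, not part of this commit):

import requests

# "!bd" is the shortcut declared in the baidu engine entry above
resp = requests.get(
    "http://localhost:8888/search",
    params={"q": "!bd hello world", "format": "json"},
    timeout=10,
)
for result in resp.json()["results"][:3]:
    print(result["title"], "->", result["url"])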