mirror of
https://github.com/parchlinuxB/Gitee.git
synced 2025-02-23 02:15:43 -05:00
add Baidu Engine
This commit is contained in:
parent
4f6c15f5c8
commit
9cc537a106
3 changed files with 194 additions and 31 deletions
|
@ -20,3 +20,4 @@ msgspec==0.18.6
|
||||||
eval_type_backport; python_version < '3.9'
|
eval_type_backport; python_version < '3.9'
|
||||||
typer-slim==0.14.0
|
typer-slim==0.14.0
|
||||||
isodate==0.7.2
|
isodate==0.7.2
|
||||||
|
beautifulsoup4==4.12.2
|
||||||
|
|
156
searx/engines/baidu.py
Normal file
156
searx/engines/baidu.py
Normal file
|
@ -0,0 +1,156 @@
|
||||||
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
||||||
|
"""
|
||||||
|
Baidu
|
||||||
|
"""
|
||||||
|
|
||||||
|
from json import loads
|
||||||
|
from urllib.parse import urlencode, urlparse
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from searx.exceptions import SearxEngineException
|
||||||
|
|
||||||
|
# import requests
|
||||||
|
# about
|
||||||
|
# Engine metadata shown on the instance's engine overview.
about = {
    # The engine queries www.baidu.com (see ``baidu_search_url`` below).
    "website": "https://www.baidu.com",
    "wikidata_id": "Q14772",
    "use_official_api": False,
    "require_api_key": False,
    # The result page is scraped as HTML with BeautifulSoup, not read as JSON.
    "results": "HTML",
}

# engine dependent config
categories = ["general"]
# request() derives the ``pn`` offset from params["pageno"], so declare
# paging support; without it only the first page is ever requested.
paging = True

# search-url
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&{query}"

# Upper bound (in characters) for the abstract of a result; a falsy value
# disables the truncation.
ABSTRACT_MAX_LENGTH = 500
|
||||||
|
|
||||||
|
|
||||||
|
# do search-request
|
||||||
|
def request(query, params):
    """Fill ``params`` with the URL and HTTP headers of a Baidu search.

    ``query`` is sent as the ``wd`` argument and the page number from
    ``params["pageno"]`` is converted into the ``pn`` result offset (ten
    results per page).  The same ``params`` mapping is returned.
    """
    first_result = 10 * (params["pageno"] - 1)
    query_string = urlencode({"wd": query, "pn": first_result})
    params["url"] = baidu_search_url.format(query=query_string)

    # Browser-like headers sent with every request.
    browser_headers = (
        (
            "Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        ),
        ("Accept-Language", "zh-CN,zh;q=0.9"),
        ("Content-Type", "application/x-www-form-urlencoded"),
        (
            "User-Agent",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36",
        ),
        ("Accept-Encoding", "gzip, deflate"),
        ("Referer", "https://www.baidu.com/"),
    )
    outgoing = params["headers"]
    for header_name, header_value in browser_headers:
        outgoing[header_name] = header_value

    return params
|
||||||
|
|
||||||
|
|
||||||
|
# get response from search-request
|
||||||
|
def _baidu_title_and_url(div, first_line_title):
    """Return ``(title, url)`` for one Baidu result container.

    The ``<h3>`` heading is preferred for both values.  Without a heading the
    title is the first line of the container text when ``first_line_title``
    is true, otherwise the text of the container's first child, and the URL
    comes from the first anchor inside the container.  A missing anchor or
    ``href`` yields an empty URL instead of raising.
    """
    heading = div.h3
    if heading is not None:
        title = heading.text.strip()
        anchor = heading.a
    else:
        if first_line_title:
            title = div.text.strip().split("\n", 1)[0]
        elif div.contents:
            title = div.contents[0].text.strip()
        else:
            title = ""
        anchor = div.a

    url = ""
    if anchor is not None:
        url = (anchor.get("href") or "").strip()
    return title, url


def _baidu_abstract(div, skip_first_line):
    """Return the abstract text of one Baidu result container.

    Lookup order: the ``div.c-abstract`` element, then the first nested
    ``<div>``, then the container's own text.  For the last fallback
    ``skip_first_line`` drops the first line (used where that line serves as
    the title); a single-line text then gives an empty abstract.
    """
    node = div.find("div", class_="c-abstract")
    if node is None:
        node = div.div
    if node is not None:
        return node.text.strip()

    text = div.text.strip()
    if skip_first_line:
        return text.partition("\n")[2].strip()
    return text


def _parse_baidu_result(div):
    """Convert one ``c-container`` element into a result dict.

    Returns ``None`` when the element has too few children for its template
    or when no URL could be extracted, since such an entry cannot be linked.
    """
    class_list = div.get("class", [])
    if "xpath-log" in class_list or "result-op" in class_list:
        title, url = _baidu_title_and_url(div, first_line_title=True)
        abstract = _baidu_abstract(div, skip_first_line=True)
    else:
        # Minimum number of child nodes required per ``tpl`` attribute value;
        # any other template needs at least two children.
        required = {"se_com_default": 0, "se_st_com_abstract": 1}
        if len(div.contents) < required.get(div.get("tpl", ""), 2):
            return None
        title, url = _baidu_title_and_url(div, first_line_title=False)
        abstract = _baidu_abstract(div, skip_first_line=False)

    if not url:
        return None

    if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
        abstract = abstract[:ABSTRACT_MAX_LENGTH]

    return {"url": url, "title": title, "content": abstract}


def response(resp):
    """Parse a Baidu result page into a list of result dicts.

    Every direct child of ``div#content_left`` carrying the ``c-container``
    class is turned into ``{"url", "title", "content"}``; entries without a
    URL are skipped.

    Raises:
        SearxEngineException: when the page cannot be parsed or the result
            container is missing.  The original error is chained as the
            cause.
    """
    results = []
    try:
        resp.encoding = "utf-8"
        root = BeautifulSoup(resp.text, "lxml")
        container = root.find("div", id="content_left")
        if container is None:
            # Unexpected page layout: report it rather than returning nothing.
            raise LookupError("no div#content_left in Baidu response")

        for div in container.contents:
            # Direct children also include text nodes; only tags are results.
            if not isinstance(div, type(container)):
                continue
            if "c-container" not in div.get("class", []):
                continue

            result = _parse_baidu_result(div)
            if result is not None:
                results.append(result)
    except Exception as exc:
        raise SearxEngineException() from exc

    return results
|
|
@ -15,7 +15,7 @@ general:
|
||||||
# expose stats in open metrics format at /metrics
|
# expose stats in open metrics format at /metrics
|
||||||
# leave empty to disable (no password set)
|
# leave empty to disable (no password set)
|
||||||
# open_metrics: <password>
|
# open_metrics: <password>
|
||||||
open_metrics: ''
|
open_metrics: ""
|
||||||
|
|
||||||
brand:
|
brand:
|
||||||
new_issue_url: https://github.com/searxng/searxng/issues/new
|
new_issue_url: https://github.com/searxng/searxng/issues/new
|
||||||
|
@ -84,7 +84,7 @@ server:
|
||||||
bind_address: "127.0.0.1"
|
bind_address: "127.0.0.1"
|
||||||
# public URL of the instance, to ensure correct inbound links. Is overwritten
|
# public URL of the instance, to ensure correct inbound links. Is overwritten
|
||||||
# by ${SEARXNG_URL}.
|
# by ${SEARXNG_URL}.
|
||||||
base_url: false # "http://example.com/location"
|
base_url: false # "http://example.com/location"
|
||||||
# rate limit the number of request on the instance, block some bots.
|
# rate limit the number of request on the instance, block some bots.
|
||||||
# Is overwritten by ${SEARXNG_LIMITER}
|
# Is overwritten by ${SEARXNG_LIMITER}
|
||||||
limiter: false
|
limiter: false
|
||||||
|
@ -95,7 +95,7 @@ server:
|
||||||
# If your instance owns a /etc/searxng/settings.yml file, then set the following
|
# If your instance owns a /etc/searxng/settings.yml file, then set the following
|
||||||
# values there.
|
# values there.
|
||||||
|
|
||||||
secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET}
|
secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET}
|
||||||
# Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
|
# Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
|
||||||
image_proxy: false
|
image_proxy: false
|
||||||
# 1.0 and 1.1 are supported
|
# 1.0 and 1.1 are supported
|
||||||
|
@ -290,17 +290,17 @@ checker:
|
||||||
lang: en
|
lang: en
|
||||||
result_container:
|
result_container:
|
||||||
- not_empty
|
- not_empty
|
||||||
- ['one_title_contains', 'citizen kane']
|
- ["one_title_contains", "citizen kane"]
|
||||||
test:
|
test:
|
||||||
- unique_results
|
- unique_results
|
||||||
|
|
||||||
android: &test_android
|
android: &test_android
|
||||||
matrix:
|
matrix:
|
||||||
query: ['android']
|
query: ["android"]
|
||||||
lang: ['en', 'de', 'fr', 'zh-CN']
|
lang: ["en", "de", "fr", "zh-CN"]
|
||||||
result_container:
|
result_container:
|
||||||
- not_empty
|
- not_empty
|
||||||
- ['one_title_contains', 'google']
|
- ["one_title_contains", "google"]
|
||||||
test:
|
test:
|
||||||
- unique_results
|
- unique_results
|
||||||
|
|
||||||
|
@ -337,7 +337,8 @@ engines:
|
||||||
categories: ["images"]
|
categories: ["images"]
|
||||||
# https://docs.searxng.org/dev/engines/online/adobe_stock.html
|
# https://docs.searxng.org/dev/engines/online/adobe_stock.html
|
||||||
adobe_order: relevance
|
adobe_order: relevance
|
||||||
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
|
adobe_content_types:
|
||||||
|
["photo", "illustration", "zip_vector", "template", "3d", "image"]
|
||||||
timeout: 6
|
timeout: 6
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
@ -439,6 +440,12 @@ engines:
|
||||||
# engine: base
|
# engine: base
|
||||||
# shortcut: bs
|
# shortcut: bs
|
||||||
|
|
||||||
|
- name: baidu
|
||||||
|
engine: baidu
|
||||||
|
shortcut: bd
|
||||||
|
timeout: 2.0
|
||||||
|
categories: general
|
||||||
|
|
||||||
- name: bandcamp
|
- name: bandcamp
|
||||||
engine: bandcamp
|
engine: bandcamp
|
||||||
shortcut: bc
|
shortcut: bc
|
||||||
|
@ -525,12 +532,12 @@ engines:
|
||||||
engine: cloudflareai
|
engine: cloudflareai
|
||||||
shortcut: cfai
|
shortcut: cfai
|
||||||
# get api token and account id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
|
# get api token and account id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
|
||||||
cf_account_id: 'your_cf_accout_id'
|
cf_account_id: "your_cf_accout_id"
|
||||||
cf_ai_api: 'your_cf_api'
|
cf_ai_api: "your_cf_api"
|
||||||
# create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
|
# create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
|
||||||
cf_ai_gateway: 'your_cf_ai_gateway_name'
|
cf_ai_gateway: "your_cf_ai_gateway_name"
|
||||||
# find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
|
# find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
|
||||||
cf_ai_model: 'ai_model_name'
|
cf_ai_model: "ai_model_name"
|
||||||
# custom your preferences
|
# custom your preferences
|
||||||
# cf_ai_model_display_name: 'Cloudflare AI'
|
# cf_ai_model_display_name: 'Cloudflare AI'
|
||||||
# cf_ai_model_assistant: 'prompts_for_assistant_role'
|
# cf_ai_model_assistant: 'prompts_for_assistant_role'
|
||||||
|
@ -601,7 +608,7 @@ engines:
|
||||||
categories: general
|
categories: general
|
||||||
disabled: true
|
disabled: true
|
||||||
paging: true
|
paging: true
|
||||||
lang_all: ''
|
lang_all: ""
|
||||||
search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189
|
search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189
|
||||||
page_size: 20
|
page_size: 20
|
||||||
results_xpath: //div[@id="site-list-content"]/div[@class="site-item"]
|
results_xpath: //div[@id="site-list-content"]/div[@class="site-item"]
|
||||||
|
@ -1654,32 +1661,32 @@ engines:
|
||||||
- name: stackoverflow
|
- name: stackoverflow
|
||||||
engine: stackexchange
|
engine: stackexchange
|
||||||
shortcut: st
|
shortcut: st
|
||||||
api_site: 'stackoverflow'
|
api_site: "stackoverflow"
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
|
|
||||||
- name: askubuntu
|
- name: askubuntu
|
||||||
engine: stackexchange
|
engine: stackexchange
|
||||||
shortcut: ubuntu
|
shortcut: ubuntu
|
||||||
api_site: 'askubuntu'
|
api_site: "askubuntu"
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
|
|
||||||
- name: superuser
|
- name: superuser
|
||||||
engine: stackexchange
|
engine: stackexchange
|
||||||
shortcut: su
|
shortcut: su
|
||||||
api_site: 'superuser'
|
api_site: "superuser"
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
|
|
||||||
- name: discuss.python
|
- name: discuss.python
|
||||||
engine: discourse
|
engine: discourse
|
||||||
shortcut: dpy
|
shortcut: dpy
|
||||||
base_url: 'https://discuss.python.org'
|
base_url: "https://discuss.python.org"
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
- name: caddy.community
|
- name: caddy.community
|
||||||
engine: discourse
|
engine: discourse
|
||||||
shortcut: caddy
|
shortcut: caddy
|
||||||
base_url: 'https://caddy.community'
|
base_url: "https://caddy.community"
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
|
@ -1687,7 +1694,7 @@ engines:
|
||||||
engine: discourse
|
engine: discourse
|
||||||
shortcut: pi
|
shortcut: pi
|
||||||
categories: [it, q&a]
|
categories: [it, q&a]
|
||||||
base_url: 'https://discourse.pi-hole.net'
|
base_url: "https://discourse.pi-hole.net"
|
||||||
disabled: true
|
disabled: true
|
||||||
|
|
||||||
- name: searchcode code
|
- name: searchcode code
|
||||||
|
@ -1800,8 +1807,7 @@ engines:
|
||||||
- name: torch
|
- name: torch
|
||||||
engine: xpath
|
engine: xpath
|
||||||
paging: true
|
paging: true
|
||||||
search_url:
|
search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
|
||||||
http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
|
|
||||||
results_xpath: //table//tr
|
results_xpath: //table//tr
|
||||||
url_xpath: ./td[2]/a
|
url_xpath: ./td[2]/a
|
||||||
title_xpath: ./td[2]/b
|
title_xpath: ./td[2]/b
|
||||||
|
@ -1971,7 +1977,7 @@ engines:
|
||||||
lang: en
|
lang: en
|
||||||
result_container:
|
result_container:
|
||||||
- not_empty
|
- not_empty
|
||||||
- ['one_title_contains', 'Tardigrada']
|
- ["one_title_contains", "Tardigrada"]
|
||||||
test:
|
test:
|
||||||
- unique_results
|
- unique_results
|
||||||
|
|
||||||
|
@ -2208,7 +2214,7 @@ engines:
|
||||||
disabled: true
|
disabled: true
|
||||||
# if you aren't using HTTPS for your local yacy instance disable https
|
# if you aren't using HTTPS for your local yacy instance disable https
|
||||||
# enable_http: false
|
# enable_http: false
|
||||||
search_mode: 'global'
|
search_mode: "global"
|
||||||
# timeout can be reduced in 'local' search mode
|
# timeout can be reduced in 'local' search mode
|
||||||
timeout: 5.0
|
timeout: 5.0
|
||||||
|
|
||||||
|
@ -2256,7 +2262,7 @@ engines:
|
||||||
no_result_for_http_status: [404]
|
no_result_for_http_status: [404]
|
||||||
about:
|
about:
|
||||||
website: https://www.woxikon.de/
|
website: https://www.woxikon.de/
|
||||||
wikidata_id: # No Wikidata ID
|
wikidata_id: # No Wikidata ID
|
||||||
use_official_api: false
|
use_official_api: false
|
||||||
require_api_key: false
|
require_api_key: false
|
||||||
results: HTML
|
results: HTML
|
||||||
|
@ -2513,11 +2519,11 @@ engines:
|
||||||
# keys: ['line']
|
# keys: ['line']
|
||||||
|
|
||||||
doi_resolvers:
|
doi_resolvers:
|
||||||
oadoi.org: 'https://oadoi.org/'
|
oadoi.org: "https://oadoi.org/"
|
||||||
doi.org: 'https://doi.org/'
|
doi.org: "https://doi.org/"
|
||||||
doai.io: 'https://dissem.in/'
|
doai.io: "https://dissem.in/"
|
||||||
sci-hub.se: 'https://sci-hub.se/'
|
sci-hub.se: "https://sci-hub.se/"
|
||||||
sci-hub.st: 'https://sci-hub.st/'
|
sci-hub.st: "https://sci-hub.st/"
|
||||||
sci-hub.ru: 'https://sci-hub.ru/'
|
sci-hub.ru: "https://sci-hub.ru/"
|
||||||
|
|
||||||
default_doi_resolver: 'oadoi.org'
|
default_doi_resolver: "oadoi.org"
|
||||||
|
|
Loading…
Add table
Reference in a new issue