diff --git a/requirements.txt b/requirements.txt
index 0af1c1901..5cfa61268 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ msgspec==0.18.6
 eval_type_backport; python_version < '3.9'
 typer-slim==0.14.0
 isodate==0.7.2
+beautifulsoup4==4.12.2
diff --git a/searx/engines/baidu.py b/searx/engines/baidu.py
new file mode 100644
index 000000000..9a269cbd6
--- /dev/null
+++ b/searx/engines/baidu.py
@@ -0,0 +1,125 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""Baidu (general web search)"""
+
+from urllib.parse import urlencode
+
+from bs4 import BeautifulSoup, Tag
+
+from searx.exceptions import SearxEngineException
+
+# about
+about = {
+    "website": "https://www.baidu.com",
+    "wikidata_id": "Q14772",
+    "use_official_api": False,
+    "require_api_key": False,
+    "results": "HTML",
+}
+
+# engine dependent config
+categories = ["general"]
+paging = True
+
+# search-url
+baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&{query}"
+
+ABSTRACT_MAX_LENGTH = 500
+
+
+# do search-request
+def request(query, params):
+    # Baidu's "pn" parameter is a result offset, not a page number
+    offset = (params["pageno"] - 1) * 10
+    params["url"] = baidu_search_url.format(query=urlencode({"wd": query, "pn": offset}))
+
+    # browser-like headers; without them Baidu tends to serve a
+    # JavaScript-only or captcha page instead of plain HTML
+    params["headers"]["Accept"] = (
+        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
+    )
+    params["headers"]["Accept-Language"] = "zh-CN,zh;q=0.9"
+    params["headers"]["Content-Type"] = "application/x-www-form-urlencoded"
+    params["headers"]["User-Agent"] = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+        " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
+    )
+    params["headers"]["Accept-Encoding"] = "gzip, deflate"
+    params["headers"]["Referer"] = "https://www.baidu.com/"
+
+    return params
+
+
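+# Baidu mixes several result templates on one page ("xpath-log", "result-op",
+# "se_st_com_abstract" and the default "se_com_default"); each stores title,
+# link and snippet in a slightly different place.  The helpers below gather
+# the per-template extraction in one place.
+def _snippet(div):
+    """Snippet from the dedicated abstract container or the first sub-<div>."""
+    abstract_div = div.find("div", class_="c-abstract")
+    if abstract_div:
+        return abstract_div.text.strip()
+    if div.div:
+        return div.div.text.strip()
+    return None
+
+
+def _parse_result(div, class_list):
+    """Return (title, url, abstract) for one "c-container" result."""
+    if "xpath-log" in class_list or "result-op" in class_list:
+        if div.h3:
+            title = div.h3.text.strip()
+            url = div.h3.a["href"].strip()
+        else:
+            title = div.text.strip().split("\n", 1)[0]
+            url = div.a["href"].strip() if div.a else ""
+        abstract = _snippet(div) or div.text.strip().split("\n", 1)[1].strip()
+    elif div.get("tpl", "") == "se_st_com_abstract":
+        title = div.h3.text.strip()
+        url = div.h3.a["href"].strip() if div.h3.a else ""
+        abstract = _snippet(div) or div.text.strip()
+    else:
+        # "se_com_default" and any unknown template
+        if div.h3:
+            title = div.h3.text.strip()
+            url = div.h3.a["href"].strip()
+        else:
+            title = div.contents[0].text.strip()
+            url = div.a["href"].strip() if div.a else ""
+        abstract = _snippet(div) or div.text.strip()
+    return title, url, abstract
+
+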
+# get response from search-request
+def response(resp):
+    resp.encoding = "utf-8"
+    root = BeautifulSoup(resp.text, "lxml")
+    div_contents = root.find("div", id="content_left")
+    if div_contents is None:
+        # unexpected page layout, e.g. a captcha interstitial
+        raise SearxEngineException("baidu: result container not found")
+
+    results = []
+    for div in div_contents.contents:
+        # skip bare NavigableStrings between the result containers
+        if not isinstance(div, Tag):
+            continue
+
+        class_list = div.get("class", [])
+        if not class_list or "c-container" not in class_list:
+            continue
+
+        try:
+            title, url, abstract = _parse_result(div, class_list)
+        except (AttributeError, IndexError, KeyError, TypeError):
+            # a single malformed result must not abort the whole page
+            continue
+
+        if not url:
+            continue
+
+        if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
+            abstract = abstract[:ABSTRACT_MAX_LENGTH]
+
+        results.append({"url": url, "title": title, "content": abstract})
+
+    return results
diff --git a/searx/settings.yml b/searx/settings.yml
index b035da2a1..3cd1d7698 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -15,7 +15,7 @@ general:
   # expose stats in open metrics format at /metrics
   # leave empty to disable (no password set)
   # open_metrics:
-  open_metrics: ''
+  open_metrics: ""
 
 brand:
   new_issue_url: https://github.com/searxng/searxng/issues/new
@@ -84,7 +84,7 @@ server:
   bind_address: "127.0.0.1"
   # public URL of the instance, to ensure correct inbound links. Is overwritten
   # by ${SEARXNG_URL}.
-  base_url: false  # "http://example.com/location"
+  base_url: false # "http://example.com/location"
   # rate limit the number of request on the instance, block some bots.
   # Is overwritten by ${SEARXNG_LIMITER}
   limiter: false
@@ -95,7 +95,7 @@
 
   # If your instance owns a /etc/searxng/settings.yml file, then set the following
   # values there.
 
-  secret_key: "ultrasecretkey"  # Is overwritten by ${SEARXNG_SECRET}
+  secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET}
   # Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
   image_proxy: false
   # 1.0 and 1.1 are supported
@@ -290,17 +290,17 @@ checker:
         lang: en
       result_container:
         - not_empty
-        - ['one_title_contains', 'citizen kane']
+        - ["one_title_contains", "citizen kane"]
       test:
         - unique_results
 
     android: &test_android
       matrix:
-        query: ['android']
-        lang: ['en', 'de', 'fr', 'zh-CN']
+        query: ["android"]
+        lang: ["en", "de", "fr", "zh-CN"]
       result_container:
         - not_empty
-        - ['one_title_contains', 'google']
+        - ["one_title_contains", "google"]
       test:
         - unique_results
 
@@ -337,7 +337,8 @@ engines:
     categories: ["images"]
     # https://docs.searxng.org/dev/engines/online/adobe_stock.html
    adobe_order: relevance
-    adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
+    adobe_content_types:
+      ["photo", "illustration", "zip_vector", "template", "3d", "image"]
     timeout: 6
     disabled: true
 
@@ -439,6 +440,12 @@
   #   engine: base
   #   shortcut: bs
 
+  - name: baidu
+    engine: baidu
+    shortcut: bd
+    timeout: 2.0
+    categories: general
+
   - name: bandcamp
     engine: bandcamp
     shortcut: bc
@@ -525,12 +532,12 @@
     engine: cloudflareai
     shortcut: cfai
     # get api token and accont id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
-    cf_account_id: 'your_cf_accout_id'
-    cf_ai_api: 'your_cf_api'
+    cf_account_id: "your_cf_account_id"
+    cf_ai_api: "your_cf_api"
     # create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
-    cf_ai_gateway: 'your_cf_ai_gateway_name'
+    cf_ai_gateway: "your_cf_ai_gateway_name"
     # find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
-    cf_ai_model: 'ai_model_name'
+    cf_ai_model: "ai_model_name"
     # custom your preferences
     # cf_ai_model_display_name: 'Cloudflare AI'
     # cf_ai_model_assistant: 'prompts_for_assistant_role'
@@ -601,7 +608,7 @@
     categories: general
     disabled: true
     paging: true
-    lang_all: ''
+    lang_all: ""
     search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189
     page_size: 20
     results_xpath: //div[@id="site-list-content"]/div[@class="site-item"]
@@ -1654,32 +1661,32 @@
   - name: stackoverflow
     engine: stackexchange
     shortcut: st
-    api_site: 'stackoverflow'
+    api_site: "stackoverflow"
     categories: [it, q&a]
 
   - name: askubuntu
     engine: stackexchange
     shortcut: ubuntu
-    api_site: 'askubuntu'
+    api_site: "askubuntu"
     categories: [it, q&a]
 
   - name: superuser
     engine: stackexchange
     shortcut: su
-    api_site: 'superuser'
+    api_site: "superuser"
     categories: [it, q&a]
 
   - name: discuss.python
     engine: discourse
     shortcut: dpy
-    base_url: 'https://discuss.python.org'
+    base_url: "https://discuss.python.org"
     categories: [it, q&a]
     disabled: true
 
   - name: caddy.community
     engine: discourse
     shortcut: caddy
-    base_url: 'https://caddy.community'
+    base_url: "https://caddy.community"
     categories: [it, q&a]
     disabled: true
 
@@ -1687,7 +1694,7 @@
     engine: discourse
     shortcut: pi
     categories: [it, q&a]
-    base_url: 'https://discourse.pi-hole.net'
+    base_url: "https://discourse.pi-hole.net"
     disabled: true
 
   - name: searchcode code
@@ -1800,8 +1807,7 @@
   - name: torch
     engine: xpath
     paging: true
-    search_url:
-      http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
+    search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
     results_xpath: //table//tr
     url_xpath: ./td[2]/a
     title_xpath: ./td[2]/b
@@ -1971,7 +1977,7 @@ engines:
           lang: en
        result_container:
          - not_empty
-          - ['one_title_contains', 'Tardigrada']
+          - ["one_title_contains", "Tardigrada"]
        test:
          - unique_results
 
@@ -2208,7 +2214,7 @@
     disabled: true
     # if you aren't using HTTPS for your local yacy instance disable https
     # enable_http: false
-    search_mode: 'global'
+    search_mode: "global"
     # timeout can be reduced in 'local' search mode
     timeout: 5.0
 
@@ -2256,7 +2262,7 @@
     no_result_for_http_status: [404]
     about:
       website: https://www.woxikon.de/
-      wikidata_id:  # No Wikidata ID
+      wikidata_id: # No Wikidata ID
       use_official_api: false
       require_api_key: false
       results: HTML
@@ -2513,11 +2519,11 @@
 #     keys: ['line']
 
 doi_resolvers:
-  oadoi.org: 'https://oadoi.org/'
-  doi.org: 'https://doi.org/'
-  doai.io: 'https://dissem.in/'
-  sci-hub.se: 'https://sci-hub.se/'
-  sci-hub.st: 'https://sci-hub.st/'
-  sci-hub.ru: 'https://sci-hub.ru/'
+  oadoi.org: "https://oadoi.org/"
+  doi.org: "https://doi.org/"
+  doai.io: "https://dissem.in/"
+  sci-hub.se: "https://sci-hub.se/"
+  sci-hub.st: "https://sci-hub.st/"
+  sci-hub.ru: "https://sci-hub.ru/"
 
-default_doi_resolver: 'oadoi.org'
+default_doi_resolver: "oadoi.org"