add Baidu Engine

This commit is contained in:
Gnkalk 2024-12-10 20:40:48 +03:30
parent 4f6c15f5c8
commit 9cc537a106
3 changed files with 194 additions and 31 deletions

requirements.txt

@@ -20,3 +20,4 @@ msgspec==0.18.6
eval_type_backport; python_version < '3.9'
typer-slim==0.14.0
isodate==0.7.2
beautifulsoup4==4.12.2

searx/engines/baidu.py Normal file

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Baidu (web search)"""

from urllib.parse import urlencode

from bs4 import BeautifulSoup, Tag

from searx.exceptions import SearxEngineException
# about
about = {
    "website": "https://www.baidu.com",
    "wikidata_id": "Q14772",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}
# engine dependent config
categories = ["general"]
paging = True

# search-url
baidu_host_url = "https://www.baidu.com"
baidu_search_url = baidu_host_url + "/s?ie=utf-8&tn=baidu&{query}"

# Baidu abstracts can be very long; truncate them for result snippets
ABSTRACT_MAX_LENGTH = 500


# do search-request
def request(query, params):
    # Baidu's "pn" parameter is a result offset (10 results per page), not a page number
    offset = (params["pageno"] - 1) * 10
    params["url"] = baidu_search_url.format(query=urlencode({"wd": query, "pn": offset}))

    # send browser-like headers so Baidu returns the desktop HTML this parser expects
    params["headers"].update(
        {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Content-Type": "application/x-www-form-urlencoded",
            "Referer": "https://www.baidu.com/",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
            ),
        }
    )
    return params


# get response from search-request
def response(resp):
    results = []
    try:
        resp.encoding = "utf-8"
        root = BeautifulSoup(resp.text, "lxml")
        div_contents = root.find("div", id="content_left")
        if div_contents is None:
            # no result container: empty result page or a captcha interstitial
            return results
        for div in div_contents.contents:
            # skip bare text nodes between the result containers
            if not isinstance(div, Tag):
                continue
            class_list = div.get("class", [])
            if "c-container" not in class_list:
                continue
            title = ""
            url = ""
            abstract = ""
            # "xpath-log" and "result-op" containers share the same layout
            if "xpath-log" in class_list or "result-op" in class_list:
                if div.h3 and div.h3.a:
                    title = div.h3.text.strip()
                    url = div.h3.a["href"].strip()
                else:
                    title = div.text.strip().split("\n", 1)[0]
                    if div.a:
                        url = div.a["href"].strip()
                if div.find("div", class_="c-abstract"):
                    abstract = div.find("div", class_="c-abstract").text.strip()
                elif div.div:
                    abstract = div.div.text.strip()
                else:
                    # drop the title line; the remainder is the snippet
                    parts = div.text.strip().split("\n", 1)
                    abstract = parts[1].strip() if len(parts) > 1 else ""
            else:
                tpl = div.get("tpl", "")
                if tpl == "se_st_com_abstract":
                    if len(div.contents) >= 1 and div.h3:
                        title = div.h3.text.strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
                elif tpl != "se_com_default":
                    if len(div.contents) >= 2:
                        if div.h3 and div.h3.a:
                            title = div.h3.text.strip()
                            url = div.h3.a["href"].strip()
                        else:
                            title = div.contents[0].text.strip()
                            if div.a:
                                url = div.a["href"].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
                else:
                    if div.h3 and div.h3.a:
                        title = div.h3.text.strip()
                        url = div.h3.a["href"].strip()
                    else:
                        title = div.contents[0].text.strip()
                        if div.a:
                            url = div.a["href"].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip()
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]
            # skip containers the parser could not extract a link from
            if not url:
                continue
            results.append({"url": url, "title": title, "content": abstract})
    except Exception as e:
        # surface parse failures as an engine error instead of swallowing them
        raise SearxEngineException(f"Baidu engine: failed to parse response ({e})") from e
    return results
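
For reviewers who want to exercise the new engine outside a running instance, a minimal sketch (hypothetical, not part of this commit; it assumes a SearXNG checkout with lxml on the PYTHONPATH and mirrors the params shape the engine framework passes in):

from unittest.mock import Mock

from searx.engines import baidu

# request(): build the search URL for page 2
params = {"pageno": 2, "headers": {}}
baidu.request("hello world", params)
print(params["url"])
# https://www.baidu.com/s?ie=utf-8&tn=baidu&wd=hello+world&pn=10

# response(): parse a canned "se_com_default" result container
html = """
<div id="content_left">
  <div class="c-container" tpl="se_com_default">
    <h3><a href="https://example.com/">Example title</a></h3>
    <div class="c-abstract">Example abstract</div>
  </div>
</div>
"""
print(baidu.response(Mock(text=html)))
# [{'url': 'https://example.com/', 'title': 'Example title', 'content': 'Example abstract'}]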

searx/settings.yml

@@ -15,7 +15,7 @@ general:
# expose stats in open metrics format at /metrics
# leave empty to disable (no password set)
# open_metrics: <password>
open_metrics: ''
open_metrics: ""
brand:
new_issue_url: https://github.com/searxng/searxng/issues/new
@@ -84,7 +84,7 @@ server:
bind_address: "127.0.0.1"
# public URL of the instance, to ensure correct inbound links. Is overwritten
# by ${SEARXNG_URL}.
base_url: false # "http://example.com/location"
base_url: false # "http://example.com/location"
# rate limit the number of request on the instance, block some bots.
# Is overwritten by ${SEARXNG_LIMITER}
limiter: false
@@ -95,7 +95,7 @@ server:
# If your instance owns a /etc/searxng/settings.yml file, then set the following
# values there.
secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET}
secret_key: "ultrasecretkey" # Is overwritten by ${SEARXNG_SECRET}
# Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
image_proxy: false
# 1.0 and 1.1 are supported
@@ -290,17 +290,17 @@ checker:
lang: en
result_container:
- not_empty
- ['one_title_contains', 'citizen kane']
- ["one_title_contains", "citizen kane"]
test:
- unique_results
android: &test_android
matrix:
query: ['android']
lang: ['en', 'de', 'fr', 'zh-CN']
query: ["android"]
lang: ["en", "de", "fr", "zh-CN"]
result_container:
- not_empty
- ['one_title_contains', 'google']
- ["one_title_contains", "google"]
test:
- unique_results
@@ -337,7 +337,8 @@ engines:
categories: ["images"]
# https://docs.searxng.org/dev/engines/online/adobe_stock.html
adobe_order: relevance
adobe_content_types: ["photo", "illustration", "zip_vector", "template", "3d", "image"]
adobe_content_types:
["photo", "illustration", "zip_vector", "template", "3d", "image"]
timeout: 6
disabled: true
@@ -439,6 +440,12 @@ engines:
# engine: base
# shortcut: bs
- name: baidu
engine: baidu
shortcut: bd
timeout: 2.0
categories: general
- name: bandcamp
engine: bandcamp
shortcut: bc
@@ -525,12 +532,12 @@ engines:
engine: cloudflareai
shortcut: cfai
# get api token and account id from https://developers.cloudflare.com/workers-ai/get-started/rest-api/
cf_account_id: 'your_cf_accout_id'
cf_ai_api: 'your_cf_api'
cf_account_id: "your_cf_accout_id"
cf_ai_api: "your_cf_api"
# create your ai gateway by https://developers.cloudflare.com/ai-gateway/get-started/creating-gateway/
cf_ai_gateway: 'your_cf_ai_gateway_name'
cf_ai_gateway: "your_cf_ai_gateway_name"
# find the model name from https://developers.cloudflare.com/workers-ai/models/#text-generation
cf_ai_model: 'ai_model_name'
cf_ai_model: "ai_model_name"
# custom your preferences
# cf_ai_model_display_name: 'Cloudflare AI'
# cf_ai_model_assistant: 'prompts_for_assistant_role'
@@ -601,7 +608,7 @@ engines:
categories: general
disabled: true
paging: true
lang_all: ''
lang_all: ""
search_url: https://curlie.org/search?q={query}&lang={lang}&start={pageno}&stime=92452189
page_size: 20
results_xpath: //div[@id="site-list-content"]/div[@class="site-item"]
@@ -1654,32 +1661,32 @@ engines:
- name: stackoverflow
engine: stackexchange
shortcut: st
api_site: 'stackoverflow'
api_site: "stackoverflow"
categories: [it, q&a]
- name: askubuntu
engine: stackexchange
shortcut: ubuntu
api_site: 'askubuntu'
api_site: "askubuntu"
categories: [it, q&a]
- name: superuser
engine: stackexchange
shortcut: su
api_site: 'superuser'
api_site: "superuser"
categories: [it, q&a]
- name: discuss.python
engine: discourse
shortcut: dpy
base_url: 'https://discuss.python.org'
base_url: "https://discuss.python.org"
categories: [it, q&a]
disabled: true
- name: caddy.community
engine: discourse
shortcut: caddy
base_url: 'https://caddy.community'
base_url: "https://caddy.community"
categories: [it, q&a]
disabled: true
@@ -1687,7 +1694,7 @@ engines:
engine: discourse
shortcut: pi
categories: [it, q&a]
base_url: 'https://discourse.pi-hole.net'
base_url: "https://discourse.pi-hole.net"
disabled: true
- name: searchcode code
@@ -1800,8 +1807,7 @@ engines:
- name: torch
engine: xpath
paging: true
search_url:
http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
search_url: http://xmh57jrknzkhv6y3ls3ubitzfqnkrwxhopf5aygthi7d6rplyvk3noyd.onion/cgi-bin/omega/omega?P={query}&DEFAULTOP=and
results_xpath: //table//tr
url_xpath: ./td[2]/a
title_xpath: ./td[2]/b
@@ -1971,7 +1977,7 @@ engines:
lang: en
result_container:
- not_empty
- ['one_title_contains', 'Tardigrada']
- ["one_title_contains", "Tardigrada"]
test:
- unique_results
@@ -2208,7 +2214,7 @@ engines:
disabled: true
# if you aren't using HTTPS for your local yacy instance disable https
# enable_http: false
search_mode: 'global'
search_mode: "global"
# timeout can be reduced in 'local' search mode
timeout: 5.0
@@ -2256,7 +2262,7 @@ engines:
no_result_for_http_status: [404]
about:
website: https://www.woxikon.de/
wikidata_id: # No Wikidata ID
wikidata_id: # No Wikidata ID
use_official_api: false
require_api_key: false
results: HTML
@@ -2513,11 +2519,11 @@ engines:
# keys: ['line']
doi_resolvers:
oadoi.org: 'https://oadoi.org/'
doi.org: 'https://doi.org/'
doai.io: 'https://dissem.in/'
sci-hub.se: 'https://sci-hub.se/'
sci-hub.st: 'https://sci-hub.st/'
sci-hub.ru: 'https://sci-hub.ru/'
oadoi.org: "https://oadoi.org/"
doi.org: "https://doi.org/"
doai.io: "https://dissem.in/"
sci-hub.se: "https://sci-hub.se/"
sci-hub.st: "https://sci-hub.st/"
sci-hub.ru: "https://sci-hub.ru/"
default_doi_resolver: 'oadoi.org'
default_doi_resolver: "oadoi.org"