Gitee/searx/locales.py

# SPDX-License-Identifier: AGPL-3.0-or-later
"""
SearXNG’s locale data
=====================

The variables :py:obj:`RTL_LOCALES` and :py:obj:`LOCALE_NAMES` are loaded from
:origin:`searx/data/locales.json` / see :py:obj:`locales_initialize` and
:ref:`update_locales.py`.

.. hint::

   Whenever the value of :py:obj:`ADDITIONAL_TRANSLATIONS` or
   :py:obj:`LOCALE_BEST_MATCH` is modified, the
   :origin:`searx/data/locales.json` needs to be rebuilt::

     ./manage data.locales

SearXNG's locale codes
======================

.. automodule:: searx.sxng_locales
   :members:


SearXNG’s locale implementations
================================
"""

from __future__ import annotations

from pathlib import Path

import babel
from babel.support import Translations
import babel.languages
import babel.core
import flask_babel
from flask.ctx import has_request_context

from searx import (
    data,
    logger,
    searx_dir,
)
from searx.extended_types import sxng_request

# child logger of the ``searx`` package logger, dedicated to this module
logger = logger.getChild('locales')


# keep a reference to the original flask_babel.get_translations before it is
# monkey patched in locales_initialize(); get_translations() falls back to it
_flask_babel_get_translations = flask_babel.get_translations

LOCALE_NAMES = {}
"""Mapping of locales and their description.  Locales e.g. 'fr' or 'pt-BR' (see
:py:obj:`locales_initialize`).

:meta hide-value:
"""

RTL_LOCALES: set[str] = set()
"""Set of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see
:py:obj:`locales_initialize`)."""

ADDITIONAL_TRANSLATIONS = {
    "dv": "ދިވެހި (Dhivehi)",
    "oc": "Occitan",
    "szl": "Ślōnski (Silesian)",
    "pap": "Papiamento",
}
"""Additional languages SearXNG has translations for but not supported by
python-babel (see :py:obj:`locales_initialize`)."""

LOCALE_BEST_MATCH = {
    "dv": "si",
    "oc": 'fr-FR',
    "szl": "pl",
    "nl-BE": "nl",
    "zh-HK": "zh-Hant-TW",
    "pap": "pt-BR",
}
"""Map a locale we do not have a translation for to a locale we have a
translation for.  For example: use the Taiwan version of the translation for
Hong Kong."""


def localeselector():
    """Return the locale babel should use, in babel's underscore notation.

    The selection starts with ``'en'``.  Inside a request context the user's
    ``locale`` preference replaces it, if that preference is set.  When the
    selected locale is one of the :py:obj:`ADDITIONAL_TRANSLATIONS` it is
    stored in ``sxng_request.form['use-translation']`` (evaluated later by
    :py:obj:`get_translations`).  Finally the locale is mapped through
    :py:obj:`LOCALE_BEST_MATCH`.
    """
    selected = 'en'
    if has_request_context():
        # an unset / empty preference keeps the English default
        selected = sxng_request.preferences.get_value('locale') or selected

    # first: remember a language that python-babel does not support
    if selected in ADDITIONAL_TRANSLATIONS:
        sxng_request.form['use-translation'] = selected

    # second: switch to a locale python-babel does support
    babel_locale = LOCALE_BEST_MATCH.get(selected, selected)

    if babel_locale == '':
        # the locale is '' if there was an error while loading the preferences
        babel_locale = 'en'

    # babel separates the subtags by underscore, SearXNG by hyphen
    return babel_locale.replace('-', '_')


def get_translations():
    """Monkey patch of :py:obj:`flask_babel.get_translations`

    Inside a request context, if ``use-translation`` of the request form names
    one of the :py:obj:`ADDITIONAL_TRANSLATIONS`, the catalog of this language
    is loaded from the first translation directory of the babel extension.  In
    all other cases the original ``flask_babel.get_translations`` is called.
    """
    if not has_request_context():
        return _flask_babel_get_translations()

    use_translation = sxng_request.form.get('use-translation')
    if use_translation not in ADDITIONAL_TRANSLATIONS:
        return _flask_babel_get_translations()

    # a language python-babel does not know: load its catalog directly
    babel_ext = flask_babel.current_app.extensions['babel']
    translations_dir = babel_ext.translation_directories[0]
    return Translations.load(translations_dir, use_translation)


# cache of get_translation_locales(); stays empty until a scan finds locales
_TR_LOCALES: list[str] = []


def get_translation_locales() -> list[str]:
    """Returns the sorted list of translation locales (*underscore*).

    The list is built from the folders in :origin:`searx/translations`: every
    sub-folder that contains a ``LC_MESSAGES`` folder counts as a locale.  A
    non-empty result is cached at module level and returned by later calls.
    """

    global _TR_LOCALES  # pylint:disable=global-statement
    if not _TR_LOCALES:
        translations_root = Path(searx_dir) / 'translations'
        _TR_LOCALES = sorted(
            entry.name
            for entry in translations_root.iterdir()
            if entry.is_dir() and (entry / 'LC_MESSAGES').is_dir()
        )
    return _TR_LOCALES


def locales_initialize():
    """Set up the locales environment of the SearXNG session.

    - replace :py:obj:`flask_babel.get_translations` by the wrapper
      :py:obj:`get_translations` (monkey patch)
    - fill the global names :py:obj:`LOCALE_NAMES` and :py:obj:`RTL_LOCALES`
      from ``data.LOCALES``
    """
    # from now on flask_babel resolves translations through our wrapper
    flask_babel.get_translations = get_translations

    locales_data = data.LOCALES
    LOCALE_NAMES.update(locales_data["LOCALE_NAMES"])
    RTL_LOCALES.update(locales_data["RTL_LOCALES"])


def region_tag(locale: babel.Locale) -> str:
    """Build SearXNG's region tag ``<language>-<territory>`` from the locale
    (e.g. zh-TW , en-US).

    :raises ValueError: if the locale has no territory.
    """
    territory = locale.territory
    if territory:
        return '-'.join((locale.language, territory))
    raise ValueError('babel.Locale %s: missed a territory' % locale)


def language_tag(locale: babel.Locale) -> str:
    """Build SearXNG's language tag from the locale.  If the locale has a
    script, the script name is appended with an underscore (e.g. en, zh_Hant).
    """
    script = locale.script
    if not script:
        return locale.language
    return locale.language + '_' + script


def get_locale(locale_tag: str) -> babel.Locale | None:
    """Parse ``locale_tag`` (subtags separated by hyphen) into a
    :py:obj:`babel.Locale` object.

    Returns ``None`` if babel raises ``UnknownLocaleError`` for the tag; any
    other exception raised while parsing is not handled here.
    """
    try:
        parsed = babel.Locale.parse(locale_tag, sep='-')
    except babel.core.UnknownLocaleError:
        return None
    return parsed


def get_official_locales(
    territory: str, languages=None, regional: bool = False, de_facto: bool = True
) -> set[babel.Locale]:
    """Returns a set of :py:obj:`babel.Locale` objects, one for each language
    that :py:obj:`babel.languages.get_official_languages` reports for the
    territory and that babel can parse in combination with the territory.

    :param territory: The territory (country or region) code.

    :param languages: A list of language codes the languages from
      :py:obj:`babel.languages.get_official_languages` should be in
      (intersection, compared case-insensitively).  If this argument is
      ``None`` (or empty), all official languages in this territory are used.

    :param regional: If the regional flag is set, then languages which are
      regionally official are also returned.

    :param de_facto: If the de_facto flag is set to `False`, then languages
      which are “de facto” official are not returned.

    """
    official = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)

    if languages:
        wanted = {code.lower() for code in languages}
        official = {code for code in official if code.lower() in wanted}

    locales = set()
    for code in official:
        try:
            locales.add(babel.Locale.parse(code + '_' + territory))
        except babel.UnknownLocaleError:
            # babel has no locale data for this language/territory pair
            continue

    return locales


def get_engine_locale(searxng_locale, engine_locales, default=None):
    """Return engine's language (aka locale) string that best fits to argument
    ``searxng_locale``.

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*::

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
          ..
          'zh'             : 'zh'
          'zh_Hans'        : 'zh'
          'zh_Hant'        : 'zh_TW'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

    If there is no direct 1:1 mapping, this function tries to narrow down
    engine's language (locale).  If no value can be determined by these
    approximation attempts the ``default`` value is returned.

    Assumptions:

    A. When the user selects a language the results should be optimized
       according to the selected language.

    B. When the user selects a language and a territory the results should be
       optimized with first priority on territory and second on language.

    First approximation rule (*by territory*):

      When the user selects a locale with territory (and a language), the
      territory has priority over the language.  If any of the official languages
      in the territory is supported by the engine (``engine_locales``) it will
      be used.

    Second approximation rule (*by language*):

      If "First approximation rule" brings no result or the user selects only a
      language without a territory.  Check in which territories the language
      has an official status and if one of these territories is supported by the
      engine.

    :param searxng_locale: SearXNG's locale string (subtags separated by
      hyphen), e.g. ``fr``, ``fr-BE`` or ``zh-TW``.

    :param engine_locales: dict that maps SearXNG locales to engine locales
      (see above).

    :param default: value returned if no engine locale can be determined.

    :return: the value from ``engine_locales`` that fits best or ``default``.

    """
    # pylint: disable=too-many-branches, too-many-return-statements

    engine_locale = engine_locales.get(searxng_locale)

    if engine_locale is not None:
        # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
        # "zh --> zh"), no need to narrow language-script nor territory.
        return engine_locale

    try:
        locale = babel.Locale.parse(searxng_locale, sep='-')
    except babel.core.UnknownLocaleError:
        # unknown combination: retry with the language subtag only
        try:
            locale = babel.Locale.parse(searxng_locale.split('-')[0])
        except babel.core.UnknownLocaleError:
            return default

    searxng_lang = language_tag(locale)
    engine_locale = engine_locales.get(searxng_lang)
    if engine_locale is not None:
        # There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
        return engine_locale

    # SearXNG's selected locale is not supported by the engine ..

    if locale.territory:
        # Try to narrow by *official* languages in the territory (??-XX).

        for official_language in babel.languages.get_official_languages(locale.territory, de_facto=True):
            engine_locale = engine_locales.get(official_language + '-' + locale.territory)
            if engine_locale is not None:
                return engine_locale

    # Engine does not support one of the official languages in the territory or
    # there is only a language selected without a territory.

    # Now lets have a look if the searxng_lang (the language selected by the
    # user) is a official language in other territories.  If so, check if
    # engine does support the searxng_lang in this other territory.

    if locale.language:

        # territory --> language info, only for the territories in which
        # searxng_lang has an official status
        terr_lang_dict = {}
        for territory, langs in babel.core.get_global("territory_languages").items():
            lang_info = langs.get(searxng_lang, {})
            if lang_info.get('official_status'):
                terr_lang_dict[territory] = lang_info

        # first: check fr-FR, de-DE .. is supported by the engine
        # exception: 'en' --> 'en-US'

        home_territory = locale.language.upper()
        if home_territory == 'EN':
            home_territory = 'US'

        if home_territory in terr_lang_dict:
            engine_locale = engine_locales.get(locale.language + '-' + home_territory)
            if engine_locale is not None:
                return engine_locale

        # second: sort by population_percent and take first match

        # drawback of "population percent": if there is a territory with a
        #   small number of people (e.g 100) but the majority speaks the
        #   language, then the percentage might be 100% (--> 100 people) but in
        #   a different territory with more people (e.g. 10.000) where only 10%
        #   speak the language the total amount of speaker is higher (--> 200
        #   people).
        #
        #   By example: The population of Saint-Martin is 33.000, of which 100%
        #   speak French, but this is less than the 30% of the approximately 2.5
        #   million Belgian citizens
        #
        #   - 'fr-MF', 'population_percent': 100.0, 'official_status': 'official'
        #   - 'fr-BE', 'population_percent': 38.0, 'official_status': 'official'

        by_population = sorted(
            terr_lang_dict.items(), key=lambda item: item[1]['population_percent'], reverse=True
        )
        for territory, _lang in by_population:
            engine_locale = engine_locales.get(locale.language + '-' + territory)
            if engine_locale is not None:
                return engine_locale

    # No luck: narrow by "language from territory" and "territory from language"
    # does not fit to a locale supported by the engine.

    return default


def match_locale(searxng_locale: str, locale_tag_list: list[str], fallback: str | None = None) -> str | None:
    """Pick the tag from ``locale_tag_list`` that fits best to
    ``searxng_locale``.

    :param str searxng_locale: SearXNG's internal representation of locale (de,
        de-DE, fr-BE, zh, zh-CN, zh-TW ..).

    :param list locale_tag_list: The list of locale tags to select from

    :param str fallback: fallback locale tag (if unset --> ``None``)

    The ``fallback`` is returned if ``searxng_locale`` is empty, if it can't be
    parsed by :py:obj:`get_locale` or if no tag matches.  The rules to find a
    match are implemented in :py:obj:`get_engine_locale`, the
    ``engine_locales`` is built up by :py:obj:`build_engine_locales`.

    .. hint::

       The *SearXNG locale* string and the members of ``locale_tag_list`` have
       to be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in
       the UI and are not known by babel --> will be ignored.
    """

    # example:
    #   searxng_locale = 'es'
    #   locale_tag_list = ['es-AR', 'es-ES', 'es-MX']

    if not searxng_locale:
        return fallback

    locale = get_locale(searxng_locale)
    if locale is None:
        return fallback

    # normalize to a SearXNG locale that can be passed to get_engine_locale:
    # a region tag if there is a territory, otherwise a language tag
    normalized = region_tag(locale) if locale.territory else language_tag(locale)

    # drop the tags that are no locales ('all', 'auto') or unknown to babel
    candidates = [
        tag for tag in locale_tag_list if tag not in ('all', 'auto') and tag not in ADDITIONAL_TRANSLATIONS
    ]

    # emulate fetch_traits
    return get_engine_locale(normalized, build_engine_locales(candidates), default=fallback)


def build_engine_locales(tag_list: list[str]):
    """Build, from a list of locale tags, the dictionary that can be passed by
    argument ``engine_locales`` to :py:obj:`get_engine_locale`.  This function
    is mainly used by :py:obj:`match_locale` and is similar to what the
    ``fetch_traits(..)`` function of engines do.

    Tags with a territory are registered by their region tag, tags without a
    territory by their language tag.  If a tag with a territory also has a
    *script code*, an additional key (the language tag including the script) is
    added to the returned dictionary.  Tags that can't be parsed by
    :py:obj:`get_locale` are skipped with a warning.

    .. code:: python

       >>> import locales
       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
       >>> engine_locales
       {
           'en': 'en', 'en-US': 'en-US',
           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
       }
       >>> get_engine_locale('zh-Hans', engine_locales)
       'zh-CN'

    This function is a good example to understand the language/region model
    of SearXNG:

      SearXNG only distinguishes between **search languages** and **search
      regions**, by adding the *script-tags*, languages with *script-tags* can
      be assigned to the **regions** that SearXNG supports.

    """
    engine_locales = {}

    for tag in tag_list:
        locale = get_locale(tag)
        if locale is None:
            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
            continue

        has_territory = bool(locale.territory)
        if has_territory:
            engine_locales[region_tag(locale)] = tag
        # the language tag is registered for pure languages and for regions
        # that carry a script
        if not has_territory or locale.script:
            engine_locales[language_tag(locale)] = tag

    return engine_locales
-												[mod] pylint & document searx.locales (settings.yml: remove locales)

- Add ``# lint: pylint`` header to pylint this python file.
- Fix issues reported by pylint.
- Add source code documentation of modul searx.locales

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-08-03 18:17:23 +02:00
+								# SPDX-License-Identifier: AGPL-3.0-or-later
-												[mod] locale: use hyphen everywhere except for Babel

											
										
										
											2021-10-06 10:26:40 +02:00
+								"""
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By store these variables in searx.data when the translations are updated we save
round about 65MB (usually 4 worker = 260MB of RAM saved.

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								SearXNG’s locale data
 								=====================
-												[mod] pylint & document searx.locales (settings.yml: remove locales)

- Add ``# lint: pylint`` header to pylint this python file.
- Fix issues reported by pylint.
- Add source code documentation of modul searx.locales

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-08-03 18:17:23 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By store these variables in searx.data when the translations are updated we save
round about 65MB (usually 4 worker = 260MB of RAM saved.

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								The variables :py:obj:`RTL_LOCALES` and :py:obj:`LOCALE_NAMES` are loaded from
 								:origin:`searx/data/locales.json` / see :py:obj:`locales_initialize` and
 								:ref:`update_locales.py`.
 								.. hint::
 								   Whenever the value of :py:obj:`ADDITIONAL_TRANSLATIONS` or
 								   :py:obj:`LOCALE_BEST_MATCH` is modified, the
 								   :origin:`searx/data/locales.json` needs to be rebuild::
 								     ./manage data.locales
 								SearXNG's locale codes
 								======================
 								.. automodule:: searx.sxng_locales
 								   :members:
 								SearXNG’s locale implementations
 								================================
 								"""
 								from __future__ import annotations
 								from pathlib import Path
-												[mod] settings.yml: remove locales

There are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* an can be removed when all engines
   has been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								import babel
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								from babel.support import Translations
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with a engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage has an higher amount in BE (68.%) compared to CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								import babel.languages
 								import babel.core
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								import flask_babel
 								from flask.ctx import has_request_context
-												[refactor] typification of SearXNG (initial) / result items (part 1)

Typification of SearXNG
=======================

This patch introduces the typing of the results.  The why and how is described
in the documentation, please generate the documentation ..

    $ make docs.clean docs.live

and read the following articles in the "Developer documentation":

- result types --> http://0.0.0.0:8000/dev/result_types/index.html

The result types are available from the `searx.result_types` module.  The
following have been implemented so far:

- base result type: `searx.result_type.Result`
  --> http://0.0.0.0:8000/dev/result_types/base_result.html

- answer results
  --> http://0.0.0.0:8000/dev/result_types/answer.html

including the type for translations (inspired by #3925).  For all other
types (which still need to be set up in subsequent PRs), template documentation
has been created for the transition period.

Doc of the fields used in Templates
===================================

The template documentation is the basis for the typing and is the first complete
documentation of the results (needed for engine development).  It is the
"working paper" (the plan) with which further typifications can be implemented
in subsequent PRs.

- https://github.com/searxng/searxng/issues/357

Answer Templates
================

With the new (sub) types for `Answer`, the templates for the answers have also
been revised, `Translation` are now displayed with collapsible entries (inspired
by #3925).

    !en-de dog

Plugins & Answerer
==================

The implementation for `Plugin` and `Answer` has been revised, see
documentation:

- Plugin: http://0.0.0.0:8000/dev/plugins/index.html
- Answerer: http://0.0.0.0:8000/dev/answerers/index.html

With `AnswerStorage` and `AnswerStorage` to manage those items (in follow up
PRs, `ArticleStorage`, `InfoStorage` and .. will be implemented)

Autocomplete
============

The autocompletion had a bug where the results from `Answer` had not been shown
in the past.  To test activate autocompletion and try search terms for which we
have answerers

- statistics: type `min 1 2 3` .. in the completion list you should find an
  entry like `[de] min(1, 2, 3) = 1`

- random: type `random uuid` .. in the completion list, the first item is a
  random UUID

Extended Types
==============

SearXNG extends e.g. the request and response types of flask and httpx, a module
has been set up for type extensions:

- Extended Types
  --> http://0.0.0.0:8000/dev/extended_types.html

Unit-Tests
==========

The unit tests have been completely revised.  In the previous implementation,
the runtime (the global variables such as `searx.settings`) was not initialized
before each test, so the runtime environment with which a test ran was always
determined by the tests that ran before it.  This was also the reason why we
sometimes had to observe non-deterministic errors in the tests in the past:

- https://github.com/searxng/searxng/issues/2988 is one example for the Runtime
  issues, with non-deterministic behavior ..

- https://github.com/searxng/searxng/pull/3650
- https://github.com/searxng/searxng/pull/3654
- https://github.com/searxng/searxng/pull/3642#issuecomment-2226884469
- https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005

Why msgspec.Struct
==================

We have already discussed typing based on e.g. `TypeDict` or `dataclass` in the past:

- https://github.com/searxng/searxng/pull/1562/files
- https://gist.github.com/dalf/972eb05e7a9bee161487132a7de244d2
- https://github.com/searxng/searxng/pull/1412/files
- https://github.com/searxng/searxng/pull/1356

In my opinion, TypeDict is unsuitable because the objects are still dictionaries
and not instances of classes / the `dataclass` are classes but ...

The `msgspec.Struct` combine the advantages of typing, runtime behaviour and
also offer the option of (fast) serializing (incl. type check) the objects.

Currently not possible but conceivable with `msgspec`: Outsourcing the engines
into separate processes, what possibilities this opens up in the future is left
to the imagination!

Internally, we have already defined that it is desirable to decouple the
development of the engines from the development of the SearXNG core / The
serialization of the `Result` objects is a prerequisite for this.

HINT: The threads listed above were the template for this PR, even though the
implementation here is based on msgspec.  They should also be an inspiration for
the following PRs of typification, as the models and implementations can provide
a good direction.

Why just one commit?
====================

I tried to create several (thematically separated) commits, but gave up at some
point ... there are too many things to tackle at once / The comprehensibility of
the commits would not be improved by a thematic separation. On the contrary, we
would have to make multiple changes at the same places and the goal of a change
would be vaguely recognizable in the fog of the commits.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-12-15 09:59:50 +01:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are updated from
weblate and stored in::

    searx/data/locales.json

This file can be built by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated, we
save about 65MB (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								from searx import (
 								    data,
 								    logger,
 								    searx_dir,
 								)
-												[refactor] typification of SearXNG (initial) / result items (part 1)

Typification of SearXNG
=======================

This patch introduces the typing of the results.  The why and how is described
in the documentation, please generate the documentation ..

    $ make docs.clean docs.live

and read the following articles in the "Developer documentation":

- result types --> http://0.0.0.0:8000/dev/result_types/index.html

The result types are available from the `searx.result_types` module.  The
following have been implemented so far:

- base result type: `searx.result_types.Result`
  --> http://0.0.0.0:8000/dev/result_types/base_result.html

- answer results
  --> http://0.0.0.0:8000/dev/result_types/answer.html

including the type for translations (inspired by #3925).  For all other
types (which still need to be set up in subsequent PRs), template documentation
has been created for the transition period.

Doc of the fields used in Templates
===================================

The template documentation is the basis for the typing and is the first complete
documentation of the results (needed for engine development).  It is the
"working paper" (the plan) with which further typifications can be implemented
in subsequent PRs.

- https://github.com/searxng/searxng/issues/357

Answer Templates
================

With the new (sub) types for `Answer`, the templates for the answers have also
been revised, `Translation` are now displayed with collapsible entries (inspired
by #3925).

    !en-de dog

Plugins & Answerer
==================

The implementation for `Plugin` and `Answer` has been revised, see
documentation:

- Plugin: http://0.0.0.0:8000/dev/plugins/index.html
- Answerer: http://0.0.0.0:8000/dev/answerers/index.html

With `PluginStorage` and `AnswerStorage` to manage those items (in follow-up
PRs, `ArticleStorage`, `InfoStorage` and others will be implemented)

Autocomplete
============

The autocompletion had a bug where the results from `Answer` had not been shown
in the past.  To test activate autocompletion and try search terms for which we
have answerers

- statistics: type `min 1 2 3` .. in the completion list you should find an
  entry like `[de] min(1, 2, 3) = 1`

- random: type `random uuid` .. in the completion list, the first item is a
  random UUID

Extended Types
==============

SearXNG extends e.g. the request and response types of flask and httpx, a module
has been set up for type extensions:

- Extended Types
  --> http://0.0.0.0:8000/dev/extended_types.html

Unit-Tests
==========

The unit tests have been completely revised.  In the previous implementation,
the runtime (the global variables such as `searx.settings`) was not initialized
before each test, so the runtime environment with which a test ran was always
determined by the tests that ran before it.  This was also the reason why we
sometimes had to observe non-deterministic errors in the tests in the past:

- https://github.com/searxng/searxng/issues/2988 is one example for the Runtime
  issues, with non-deterministic behavior ..

- https://github.com/searxng/searxng/pull/3650
- https://github.com/searxng/searxng/pull/3654
- https://github.com/searxng/searxng/pull/3642#issuecomment-2226884469
- https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005

Why msgspec.Struct
==================

We have already discussed typing based on e.g. `TypedDict` or `dataclass` in the past:

- https://github.com/searxng/searxng/pull/1562/files
- https://gist.github.com/dalf/972eb05e7a9bee161487132a7de244d2
- https://github.com/searxng/searxng/pull/1412/files
- https://github.com/searxng/searxng/pull/1356

In my opinion, TypedDict is unsuitable because the objects are still dictionaries
and not instances of classes / the `dataclass` are classes but ...

The `msgspec.Struct` combines the advantages of typing, runtime behaviour and
also offers the option of (fast) serializing (incl. type check) the objects.

Currently not possible but conceivable with `msgspec`: Outsourcing the engines
into separate processes, what possibilities this opens up in the future is left
to the imagination!

Internally, we have already defined that it is desirable to decouple the
development of the engines from the development of the SearXNG core / The
serialization of the `Result` objects is a prerequisite for this.

HINT: The threads listed above were the template for this PR, even though the
implementation here is based on msgspec.  They should also be an inspiration for
the following PRs of typification, as the models and implementations can provide
a good direction.

Why just one commit?
====================

I tried to create several (thematically separated) commits, but gave up at some
point ... there are too many things to tackle at once / The comprehensibility of
the commits would not be improved by a thematic separation. On the contrary, we
would have to make multiple changes at the same places and the goal of a change
would be vaguely recognizable in the fog of the commits.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-12-15 09:59:50 +01:00
+								from searx.extended_types import sxng_request
-												[mod] settings.yml: remove locales

They are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								logger = logger.getChild('locales')
 								# safe before monkey patching flask_babel.get_translations
 								_flask_babel_get_translations = flask_babel.get_translations
 								LOCALE_NAMES = {}
 								"""Mapping of locales and their description.  Locales e.g. 'fr' or 'pt-BR' (see
-												[fix] and improve docs generated from source code.

Fix::

    searx/locales.py:docstring of searx.locales.get_engine_locale:17: \
      WARNING: Definition list ends without a blank line; unexpected unindent.

Improvement: don't show default values in the generated documentation when it is
more of a mess than useful information (`:meta hide-value:`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-18 12:44:12 +02:00
+								:py:obj:`locales_initialize`).
 								:meta hide-value:
 								"""
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are updated from
weblate and stored in::

    searx/data/locales.json

This file can be built by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated, we
save about 65MB (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								RTL_LOCALES: set[str] = set()
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								"""List of *Right-To-Left* locales e.g. 'he' or 'fa-IR' (see
 								:py:obj:`locales_initialize`)."""
 								ADDITIONAL_TRANSLATIONS = {
-												searx.locale: add Dhivehi language

											
										
										
											2022-11-04 16:49:43 +00:00
+								    "dv": "ދިވެހި (Dhivehi)",
-												[mod] searx/locales.py: language names based on Unicode CLDR

rename "oc" to "Occitan":
* https://github.com/unicode-org/cldr/blob/35.1/seed/main/oc.xml#L115
* https://oc.wikipedia.org/wiki/Occitan

see https://github.com/searxng/searxng/pull/247#issuecomment-892382001

											
										
										
											2021-08-04 09:50:34 +02:00
+								    "oc": "Occitan",
-												Add support for the Silesian language

											
										
										
											2022-05-06 09:40:45 +00:00
+								    "szl": "Ślōnski (Silesian)",
-												locales.py: add support for Papiamento

											
										
										
											2022-07-08 10:00:20 +02:00
+								    "pap": "Papiamento",
-												[mod] settings.yml: remove locales

They are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
+								}
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								"""Additional languages SearXNG has translations for but not supported by
 								python-babel (see :py:obj:`locales_initialize`)."""
-												[mod] pylint & document searx.locales (settings.yml: remove locales)

- Add ``# lint: pylint`` header to pylint this python file.
- Fix issues reported by pylint.
- Add source code documentation of module searx.locales

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-08-03 18:17:23 +02:00
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								LOCALE_BEST_MATCH = {
-												searx.locale: add Dhivehi language

											
										
										
											2022-11-04 16:49:43 +00:00
+								    "dv": "si",
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								    "oc": 'fr-FR',
 								    "szl": "pl",
 								    "nl-BE": "nl",
 								    "zh-HK": "zh-Hant-TW",
-												locales.py: add support for Papiamento

											
										
										
											2022-07-08 10:00:20 +02:00
+								    "pap": "pt-BR",
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								}
 								"""Map a locale we do not have a translations for to a locale we have a
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are updated from
weblate and stored in::

    searx/data/locales.json

This file can be built by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated, we
save about 65MB (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								translation for.  By example: use Taiwan version of the translation for Hong
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								Kong."""
-												[mod] settings.yml: remove locales

They are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								def localeselector():
 								    locale = 'en'
 								    if has_request_context():
-												[refactor] typification of SearXNG (initial) / result items (part 1)

Typification of SearXNG
=======================

This patch introduces the typing of the results.  The why and how is described
in the documentation, please generate the documentation ..

    $ make docs.clean docs.live

and read the following articles in the "Developer documentation":

- result types --> http://0.0.0.0:8000/dev/result_types/index.html

The result types are available from the `searx.result_types` module.  The
following have been implemented so far:

- base result type: `searx.result_types.Result`
  --> http://0.0.0.0:8000/dev/result_types/base_result.html

- answer results
  --> http://0.0.0.0:8000/dev/result_types/answer.html

including the type for translations (inspired by #3925).  For all other
types (which still need to be set up in subsequent PRs), template documentation
has been created for the transition period.

Doc of the fields used in Templates
===================================

The template documentation is the basis for the typing and is the first complete
documentation of the results (needed for engine development).  It is the
"working paper" (the plan) with which further typifications can be implemented
in subsequent PRs.

- https://github.com/searxng/searxng/issues/357

Answer Templates
================

With the new (sub) types for `Answer`, the templates for the answers have also
been revised, `Translation` are now displayed with collapsible entries (inspired
by #3925).

    !en-de dog

Plugins & Answerer
==================

The implementation for `Plugin` and `Answer` has been revised, see
documentation:

- Plugin: http://0.0.0.0:8000/dev/plugins/index.html
- Answerer: http://0.0.0.0:8000/dev/answerers/index.html

With `PluginStorage` and `AnswerStorage` to manage those items (in follow-up
PRs, `ArticleStorage`, `InfoStorage` and others will be implemented)

Autocomplete
============

The autocompletion had a bug where the results from `Answer` had not been shown
in the past.  To test activate autocompletion and try search terms for which we
have answerers

- statistics: type `min 1 2 3` .. in the completion list you should find an
  entry like `[de] min(1, 2, 3) = 1`

- random: type `random uuid` .. in the completion list, the first item is a
  random UUID

Extended Types
==============

SearXNG extends e.g. the request and response types of flask and httpx, a module
has been set up for type extensions:

- Extended Types
  --> http://0.0.0.0:8000/dev/extended_types.html

Unit-Tests
==========

The unit tests have been completely revised.  In the previous implementation,
the runtime (the global variables such as `searx.settings`) was not initialized
before each test, so the runtime environment with which a test ran was always
determined by the tests that ran before it.  This was also the reason why we
sometimes had to observe non-deterministic errors in the tests in the past:

- https://github.com/searxng/searxng/issues/2988 is one example for the Runtime
  issues, with non-deterministic behavior ..

- https://github.com/searxng/searxng/pull/3650
- https://github.com/searxng/searxng/pull/3654
- https://github.com/searxng/searxng/pull/3642#issuecomment-2226884469
- https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005

Why msgspec.Struct
==================

We have already discussed typing based on e.g. `TypedDict` or `dataclass` in the past:

- https://github.com/searxng/searxng/pull/1562/files
- https://gist.github.com/dalf/972eb05e7a9bee161487132a7de244d2
- https://github.com/searxng/searxng/pull/1412/files
- https://github.com/searxng/searxng/pull/1356

In my opinion, TypedDict is unsuitable because the objects are still dictionaries
and not instances of classes / the `dataclass` are classes but ...

The `msgspec.Struct` combines the advantages of typing, runtime behaviour and
also offers the option of (fast) serializing (incl. type check) the objects.

Currently not possible but conceivable with `msgspec`: Outsourcing the engines
into separate processes, what possibilities this opens up in the future is left
to the imagination!

Internally, we have already defined that it is desirable to decouple the
development of the engines from the development of the SearXNG core / The
serialization of the `Result` objects is a prerequisite for this.

HINT: The threads listed above were the template for this PR, even though the
implementation here is based on msgspec.  They should also be an inspiration for
the following PRs of typification, as the models and implementations can provide
a good direction.

Why just one commit?
====================

I tried to create several (thematically separated) commits, but gave up at some
point ... there are too many things to tackle at once / The comprehensibility of
the commits would not be improved by a thematic separation. On the contrary, we
would have to make multiple changes at the same places and the goal of a change
would be vaguely recognizable in the fog of the commits.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-12-15 09:59:50 +01:00
+								        value = sxng_request.preferences.get_value('locale')
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								        if value:
 								            locale = value
 								    # first, set the language that is not supported by babel
 								    if locale in ADDITIONAL_TRANSLATIONS:
-												[refactor] typification of SearXNG (initial) / result items (part 1)

Typification of SearXNG
=======================

This patch introduces the typing of the results.  The why and how is described
in the documentation, please generate the documentation ..

    $ make docs.clean docs.live

and read the following articles in the "Developer documentation":

- result types --> http://0.0.0.0:8000/dev/result_types/index.html

The result types are available from the `searx.result_types` module.  The
following have been implemented so far:

- base result type: `searx.result_types.Result`
  --> http://0.0.0.0:8000/dev/result_types/base_result.html

- answer results
  --> http://0.0.0.0:8000/dev/result_types/answer.html

including the type for translations (inspired by #3925).  For all other
types (which still need to be set up in subsequent PRs), template documentation
has been created for the transition period.

Doc of the fields used in Templates
===================================

The template documentation is the basis for the typing and is the first complete
documentation of the results (needed for engine development).  It is the
"working paper" (the plan) with which further typifications can be implemented
in subsequent PRs.

- https://github.com/searxng/searxng/issues/357

Answer Templates
================

With the new (sub) types for `Answer`, the templates for the answers have also
been revised, `Translation` are now displayed with collapsible entries (inspired
by #3925).

    !en-de dog

Plugins & Answerer
==================

The implementation for `Plugin` and `Answer` has been revised, see
documentation:

- Plugin: http://0.0.0.0:8000/dev/plugins/index.html
- Answerer: http://0.0.0.0:8000/dev/answerers/index.html

With `PluginStorage` and `AnswerStorage` to manage those items (in follow-up
PRs, `ArticleStorage`, `InfoStorage` and others will be implemented)

Autocomplete
============

The autocompletion had a bug where the results from `Answer` had not been shown
in the past.  To test activate autocompletion and try search terms for which we
have answerers

- statistics: type `min 1 2 3` .. in the completion list you should find an
  entry like `[de] min(1, 2, 3) = 1`

- random: type `random uuid` .. in the completion list, the first item is a
  random UUID

Extended Types
==============

SearXNG extends e.g. the request and response types of flask and httpx, a module
has been set up for type extensions:

- Extended Types
  --> http://0.0.0.0:8000/dev/extended_types.html

Unit-Tests
==========

The unit tests have been completely revised.  In the previous implementation,
the runtime (the global variables such as `searx.settings`) was not initialized
before each test, so the runtime environment with which a test ran was always
determined by the tests that ran before it.  This was also the reason why we
sometimes had to observe non-deterministic errors in the tests in the past:

- https://github.com/searxng/searxng/issues/2988 is one example for the Runtime
  issues, with non-deterministic behavior ..

- https://github.com/searxng/searxng/pull/3650
- https://github.com/searxng/searxng/pull/3654
- https://github.com/searxng/searxng/pull/3642#issuecomment-2226884469
- https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005

Why msgspec.Struct
==================

We have already discussed typing based on e.g. `TypedDict` or `dataclass` in the past:

- https://github.com/searxng/searxng/pull/1562/files
- https://gist.github.com/dalf/972eb05e7a9bee161487132a7de244d2
- https://github.com/searxng/searxng/pull/1412/files
- https://github.com/searxng/searxng/pull/1356

In my opinion, TypedDict is unsuitable because the objects are still dictionaries
and not instances of classes / the `dataclass` are classes but ...

The `msgspec.Struct` combines the advantages of typing, runtime behaviour and
also offers the option of (fast) serializing (incl. type check) the objects.

Currently not possible but conceivable with `msgspec`: Outsourcing the engines
into separate processes, what possibilities this opens up in the future is left
to the imagination!

Internally, we have already defined that it is desirable to decouple the
development of the engines from the development of the SearXNG core / The
serialization of the `Result` objects is a prerequisite for this.

HINT: The threads listed above were the template for this PR, even though the
implementation here is based on msgspec.  They should also be an inspiration for
the following PRs of typification, as the models and implementations can provide
a good direction.

Why just one commit?
====================

I tried to create several (thematically separated) commits, but gave up at some
point ... there are too many things to tackle at once / The comprehensibility of
the commits would not be improved by a thematic separation. On the contrary, we
would have to make multiple changes at the same places and the goal of a change
would be vaguely recognizable in the fog of the commits.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-12-15 09:59:50 +01:00
+								        sxng_request.form['use-translation'] = locale
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
 								    # second, map locale to a value python-babel supports
 								    locale = LOCALE_BEST_MATCH.get(locale, locale)
 								    if locale == '':
 								        # if there is an error loading the preferences
 								        # the locale is going to be ''
 								        locale = 'en'
 								    # babel uses underscore instead of hyphen.
 								    locale = locale.replace('-', '_')
 								    return locale
 								def get_translations():
-												[doc] fix some leftovers from ad964562c

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-14 16:31:41 +02:00
+								    """Monkey patch of :py:obj:`flask_babel.get_translations`"""
-												searx.locales: improve support for languages not supported by babel

* refactor get_translations() to rely on ADDITIONAL_TRANSLATIONS and LOCALE_BEST_MATCH
* update RTL_LOCALES for languages in ADDITIONAL_TRANSLATIONS

											
										
										
											2022-11-04 16:47:02 +00:00
+								    if has_request_context():
-												[refactor] typification of SearXNG (initial) / result items (part 1)

Typification of SearXNG
=======================

This patch introduces the typing of the results.  The why and how is described
in the documentation, please generate the documentation ..

    $ make docs.clean docs.live

and read the following articles in the "Developer documentation":

- result types --> http://0.0.0.0:8000/dev/result_types/index.html

The result types are available from the `searx.result_types` module.  The
following have been implemented so far:

- base result type: `searx.result_type.Result`
  --> http://0.0.0.0:8000/dev/result_types/base_result.html

- answer results
  --> http://0.0.0.0:8000/dev/result_types/answer.html

including the type for translations (inspired by #3925).  For all other
types (which still need to be set up in subsequent PRs), template documentation
has been created for the transition period.

Doc of the fields used in Templates
===================================

The template documentation is the basis for the typing and is the first complete
documentation of the results (needed for engine development).  It is the
"working paper" (the plan) with which further typifications can be implemented
in subsequent PRs.

- https://github.com/searxng/searxng/issues/357

Answer Templates
================

With the new (sub) types for `Answer`, the templates for the answers have also
been revised, `Translation` are now displayed with collapsible entries (inspired
by #3925).

    !en-de dog

Plugins & Answerer
==================

The implementation for `Plugin` and `Answer` has been revised, see
documentation:

- Plugin: http://0.0.0.0:8000/dev/plugins/index.html
- Answerer: http://0.0.0.0:8000/dev/answerers/index.html

With `PluginStorage` and `AnswerStorage` to manage those items (in follow-up
PRs, `ArticleStorage`, `InfoStorage` and .. will be implemented)

Autocomplete
============

The autocompletion had a bug where the results from `Answer` had not been shown
in the past.  To test activate autocompletion and try search terms for which we
have answerers

- statistics: type `min 1 2 3` .. in the completion list you should find an
  entry like `[de] min(1, 2, 3) = 1`

- random: type `random uuid` .. in the completion list, the first item is a
  random UUID

Extended Types
==============

SearXNG extends e.g. the request and response types of flask and httpx, a module
has been set up for type extensions:

- Extended Types
  --> http://0.0.0.0:8000/dev/extended_types.html

Unit-Tests
==========

The unit tests have been completely revised.  In the previous implementation,
the runtime (the global variables such as `searx.settings`) was not initialized
before each test, so the runtime environment with which a test ran was always
determined by the tests that ran before it.  This was also the reason why we
sometimes had to observe non-deterministic errors in the tests in the past:

- https://github.com/searxng/searxng/issues/2988 is one example for the Runtime
  issues, with non-deterministic behavior ..

- https://github.com/searxng/searxng/pull/3650
- https://github.com/searxng/searxng/pull/3654
- https://github.com/searxng/searxng/pull/3642#issuecomment-2226884469
- https://github.com/searxng/searxng/pull/3746#issuecomment-2300965005

Why msgspec.Struct
==================

We have already discussed typing based on e.g. `TypeDict` or `dataclass` in the past:

- https://github.com/searxng/searxng/pull/1562/files
- https://gist.github.com/dalf/972eb05e7a9bee161487132a7de244d2
- https://github.com/searxng/searxng/pull/1412/files
- https://github.com/searxng/searxng/pull/1356

In my opinion, TypeDict is unsuitable because the objects are still dictionaries
and not instances of classes / the `dataclass` are classes but ...

The `msgspec.Struct` combine the advantages of typing, runtime behaviour and
also offer the option of (fast) serializing (incl. type check) the objects.

Currently not possible but conceivable with `msgspec`: Outsourcing the engines
into separate processes, what possibilities this opens up in the future is left
to the imagination!

Internally, we have already defined that it is desirable to decouple the
development of the engines from the development of the SearXNG core / The
serialization of the `Result` objects is a prerequisite for this.

HINT: The threads listed above were the template for this PR, even though the
implementation here is based on msgspec.  They should also be an inspiration for
the following PRs of typification, as the models and implementations can provide
a good direction.

Why just one commit?
====================

I tried to create several (thematically separated) commits, but gave up at some
point ... there are too many things to tackle at once / The comprehensibility of
the commits would not be improved by a thematic separation. On the contrary, we
would have to make multiple changes at the same places and the goal of a change
would be vaguely recognizable in the fog of the commits.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-12-15 09:59:50 +01:00
+								        use_translation = sxng_request.form.get('use-translation')
-												searx.locales: improve support for languages not supported by babel

* refactor get_translations() to rely on ADDITIONAL_TRANSLATIONS and LOCALE_BEST_MATCH
* update RTL_LOCALES for languages in ADDITIONAL_TRANSLATIONS

											
										
										
											2022-11-04 16:47:02 +00:00
+								        if use_translation in ADDITIONAL_TRANSLATIONS:
 								            babel_ext = flask_babel.current_app.extensions['babel']
-												Bump flask-babel from 2.0.0 to 3.0.0

Bumps [flask-babel](https://github.com/python-babel/flask-babel) from 2.0.0 to 3.0.0.
- [Release notes](https://github.com/python-babel/flask-babel/releases)
- [Changelog](https://github.com/python-babel/flask-babel/blob/master/CHANGELOG)
- [Commits](https://github.com/python-babel/flask-babel/compare/v2.0.0...v3.0.0)

---
updated-dependencies:
- dependency-name: flask-babel
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

											
										
										
											2023-01-20 09:19:18 +00:00
+								            return Translations.load(babel_ext.translation_directories[0], use_translation)
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								    return _flask_babel_get_translations()
-												[mod] settings.yml: remove locales

They are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								_TR_LOCALES: list[str] = []
-												[mod] pylint & document searx.locales (settings.yml: remove locales)

- Add ``# lint: pylint`` header to pylint this python file.
- Fix issues reported by pylint.
- Add source code documentation of module searx.locales

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2021-08-03 18:17:23 +02:00
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								def get_translation_locales() -> list[str]:
-												[chore] *: fix typos detected by typos-cli

											
										
										
											2024-10-09 11:59:31 +02:00
+								    """Returns the list of translation locales (*underscore*).  The list is
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								    generated from the translation folders in :origin:`searx/translations`"""
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								    global _TR_LOCALES  # pylint:disable=global-statement
 								    if _TR_LOCALES:
 								        return _TR_LOCALES
-												[mod] settings.yml: remove locales

They are detected from the searx/translations directory

											
										
										
											2021-08-03 15:13:00 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								    tr_locales = []
 								    for folder in (Path(searx_dir) / 'translations').iterdir():
 								        if not folder.is_dir():
 								            continue
 								        if not (folder / 'LC_MESSAGES').is_dir():
 								            continue
 								        tr_locales.append(folder.name)
 								    _TR_LOCALES = sorted(tr_locales)
 								    return _TR_LOCALES
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								def locales_initialize():
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								    """Initialize locales environment of the SearXNG session.
-												[doc] fix some leftovers from ad964562c

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-14 16:31:41 +02:00
+								    - monkey patch :py:obj:`flask_babel.get_translations` by :py:obj:`get_translations`
-												[fix] move locale code from webapp.py to locales.py and fix #1303

To improve modularization this patch:

- moves *locale* related implementation from the webapp.py application to the
  locale.py module.

- The initialization of the locales is now done in the application (webapp) and
  is no longer done while importing searx.locales.

In the searx.locales module a new dictionary named `LOCALE_BEST_MATCH` has been
added.  In this dictionary we can map languages without a translation to
languages we have a translation for.

To fix #1303 zh-HK has been mapped to zh-Hant-TW (we do not need additional
translations of traditional Chinese)

Closes: https://github.com/searxng/searxng/issues/1303
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-06-10 17:01:12 +02:00
+								    - init global names :py:obj:`LOCALE_NAMES`, :py:obj:`RTL_LOCALES`
 								    """
 								    flask_babel.get_translations = get_translations
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								    LOCALE_NAMES.update(data.LOCALES["LOCALE_NAMES"])
 								    RTL_LOCALES.update(data.LOCALES["RTL_LOCALES"])
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with a engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								def region_tag(locale: babel.Locale) -> str:
 								    """Returns SearXNG's region tag from the locale (e.g. zh-TW , en-US)."""
 								    if not locale.territory:
-												[fix] fetch_traits: brave, google, annas_archive & radio_browser

This patch fixes a bug reported by CI "Fetch traits" [1] (brave) and improves
other fetch traits functions (google, annas_archive & radio_browser).

brave:

    File "/home/runner/work/searxng/searxng/searx/engines/brave.py", line 434, in fetch_traits
      sxng_tag = region_tag(babel.Locale.parse(ui_lang, sep='-'))
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "/home/runner/work/searxng/searxng/searx/locales.py", line 155, in region_tag
    Error:     raise ValueError('%s missed a territory')

google:

  change ERROR message about unknown UI language to INFO message

radio_browser:

  country_list contains duplicates that differ only in upper/lower case

annas_archive:

  for better diff; sort the persistence of the traits

[1] https://github.com/searxng/searxng/actions/runs/10606312371/job/29433352518#step:6:41

Signed-off-by: Markus <markus@venom.fritz.box>

											
										
										
											2024-09-15 12:22:06 +02:00
+								        raise ValueError('babel.Locale %s: missed a territory' % locale)
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								    return locale.language + '-' + locale.territory
 								def language_tag(locale: babel.Locale) -> str:
 								    """Returns SearXNG's language tag from the locale.  If the locale has a
 								    script, the tag also includes the script name, joined by an underscore
 								    (e.g. ``en``, ``zh_Hant``).

 								    :param locale: The locale to build the language tag from.
 								    :return: ``locale.language``, followed by ``_`` and ``locale.script`` when
 								      the script is set.
 								    """
 								    sxng_lang = locale.language
 								    # the script is optional: only append it when the locale carries one
 								    if locale.script:
 								        sxng_lang += '_' + locale.script
 								    return sxng_lang
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								def get_locale(locale_tag: str) -> babel.Locale | None:
-												[mod] replace utils.match_language by locales.match_locale

This patch replaces the *full of magic* ``utils.match_language`` function by a
``locales.match_locale``.  The ``locales.match_locale`` function is based on the
``locales.build_engine_locales`` introduced in 9ae409a0 [1].

In the past SearXNG did only support a search by a language but not in a region.
This has been changed a long time ago and regions have been added to SearXNG
core but not to the engines.  The ``utils.match_language`` was the function to
handle the different aspects of language/regions in SearXNG core and the
supported *languages* in the engine.  The ``utils.match_language`` did it with
some magic and works good for most use cases but fails in some edge case.

To replace the concurrence of languages and regions in the SearXNG core the
``locales.build_engine_locales`` was introduced in 9ae409a0 [1].  With the last
patches all engines have been migrated to a ``fetch_traits`` and a
language/region concept that is based on ``locales.build_engine_locales``.

To summarize: there is no longer a need for the ``utils.match_language``.

[1] https://github.com/searxng/searxng/pull/1652

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2023-02-07 14:11:58 +01:00
+								    """Returns a :py:obj:`babel.Locale` object parsed from argument
 								    ``locale_tag``"""
 								    try:
 								        locale = babel.Locale.parse(locale_tag, sep='-')
 								        return locale
 								    except babel.core.UnknownLocaleError:
 								        return None
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								def get_official_locales(
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								    territory: str, languages=None, regional: bool = False, de_facto: bool = True
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are update from
weblate and stored in::

    searx/data/locales.json

This file can be build by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we
save roughly 65MB per worker (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								) -> set[babel.Locale]:
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								    """Returns a list of :py:obj:`babel.Locale` with languages from
 								    :py:obj:`babel.languages.get_official_languages`.
 								    :param territory: The territory (country or region) code.
 								    :param languages: A list of language codes the languages from
 								      :py:obj:`babel.languages.get_official_languages` should be in
 								      (intersection).  If this argument is ``None``, all official languages in
 								      this territory are used.
 								    :param regional: If the regional flag is set, then languages which are
 								      regionally official are also returned.
 								    :param de_facto: If the de_facto flag is set to `False`, then languages
 								      which are “de facto” official are not returned.
 								    """
 								    ret_val = set()
 								    o_languages = babel.languages.get_official_languages(territory, regional=regional, de_facto=de_facto)
 								    if languages:
 								        languages = [l.lower() for l in languages]
 								        o_languages = set(l for l in o_languages if l.lower() in languages)
 								    for lang in o_languages:
 								        try:
 								            locale = babel.Locale.parse(lang + '_' + territory)
 								            ret_val.add(locale)
 								        except babel.UnknownLocaleError:
 								            continue
 								    return ret_val
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								def get_engine_locale(searxng_locale, engine_locales, default=None):
 								    """Return engine's language (aka locale) string that best fits to argument
 								    ``searxng_locale``.
 								    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
-												[fix] and improve docs generated from source code.

Fix::

    searx/locales.py:docstring of searx.locales.get_engine_locale:17: \
      WARNING: Definition list ends without a blank line; unexpected unindent.

Improvement: don't show default values in the generated documentation when it is
more of a mess than useful information (`:meta hide-value:`).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-18 12:44:12 +02:00
+								    corresponding *engine locales*::
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
 								      <engine>: {
 								          # SearXNG string : engine-string
 								          'ca-ES'          : 'ca_ES',
 								          'fr-BE'          : 'fr_BE',
 								          'fr-CA'          : 'fr_CA',
 								          'fr-CH'          : 'fr_CH',
 								          'fr'             : 'fr_FR',
 								          ...
 								          'pl-PL'          : 'pl_PL',
 								          'pt-PT'          : 'pt_PT'
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								          ..
 								          'zh'             : 'zh'
 								          'zh_Hans'        : 'zh'
-												[fix] doc of locales.get_engine_locale() / zh-classical is missleading

Wikipedia's zh-classical is not zh_Hant (see doc-string of engines.wikipedia).
Fixed the example in the doc-string of locales.get_engine_locale() to 'zh_TW'.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2023-04-17 08:40:44 +02:00
+								          'zh_Hant'        : 'zh_TW'
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								      }
 								    .. hint::
 								       The *SearXNG locale* string has to be known by babel!
 								    If there is no direct 1:1 mapping, this functions tries to narrow down
 								    engine's language (locale).  If no value can be determined by these
 								    approximation attempts the ``default`` value is returned.
 								    Assumptions:
 								    A. When user select a language the results should be optimized according to
 								       the selected language.
 								    B. When user select a language and a territory the results should be
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								       optimized with first priority on territory and second on language.
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
 								    First approximation rule (*by territory*):
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								      When the user selects a locale with territory (and a language), the
 								      territory has priority over the language.  If any of the official languages
 								      in the territory is supported by the engine (``engine_locales``) it will
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								      be used.
 								    Second approximation rule (*by language*):
 								      If "First approximation rule" brings no result or the user selects only a
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								      language without a territory.  Check in which territories the language
 								      has an official status and if one of these territories is supported by the
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								      engine.
 								    """
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								    # pylint: disable=too-many-branches, too-many-return-statements
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
 								    engine_locale = engine_locales.get(searxng_locale)
 								    if engine_locale is not None:
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								        # There was a 1:1 mapping (e.g. a region "fr-BE --> fr_BE" or a language
 								        # "zh --> zh"), no need to narrow language-script nor territory.
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								        return engine_locale
-												[fix] harden get_engine_locale: handle UnknownLocaleError exceptions

When a user selects an unknown or invalid locale by using the search syntax:

    !qw siemens :de-TW

Before this patch, an UnknownLocaleError exception was raised:

```
Traceback (most recent call last):
  File "SearXNG/searx/search/processors/online.py", line 154, in search
    search_results = self._search_basic(query, params)
  File "SearXNG/searx/search/processors/online.py", line 128, in _search_basic
    self.engine.request(query, params)
  File "SearXNG/searx/engines/qwant.py", line 98, in request
    q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
  File "SearXNG/searx/locales.py", line 216, in get_engine_locale
    locale = babel.Locale.parse(searxng_locale, sep='-')
  File "SearXNG/local/py3/lib/python3.8/site-packages/babel/core.py", line 330, in parse
    raise UnknownLocaleError(input_id)
```

This patch implements simple exception handling: since e.g. `de-TW` does not
exist, `de` will be used to get the engine's locale.  On invalid terms like `xy-XY`
the default will be returned.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-14 13:38:50 +02:00
+								    try:
 								        locale = babel.Locale.parse(searxng_locale, sep='-')
 								    except babel.core.UnknownLocaleError:
 								        try:
-												[fix] typo in get_engine_locale

Due to a typo in get_engine_locale, a language selection like `!qw :de siemens`
did not work.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-14 14:35:09 +02:00
+								            locale = babel.Locale.parse(searxng_locale.split('-')[0])
-												[fix] harden get_engine_locale: handle UnknownLocaleError exceptions

When a user selects an unknown or invalid locale by using the search syntax:

    !qw siemens :de-TW

Before this patch, an UnknownLocaleError exception was raised:

```
Traceback (most recent call last):
  File "SearXNG/searx/search/processors/online.py", line 154, in search
    search_results = self._search_basic(query, params)
  File "SearXNG/searx/search/processors/online.py", line 128, in _search_basic
    self.engine.request(query, params)
  File "SearXNG/searx/engines/qwant.py", line 98, in request
    q_locale = get_engine_locale(params['language'], supported_languages, default='en_US')
  File "SearXNG/searx/locales.py", line 216, in get_engine_locale
    locale = babel.Locale.parse(searxng_locale, sep='-')
  File "SearXNG/local/py3/lib/python3.8/site-packages/babel/core.py", line 330, in parse
    raise UnknownLocaleError(input_id)
```

This patch implements simple exception handling: since e.g. `de-TW` does not
exist, `de` will be used to get the engine's locale.  On invalid terms like `xy-XY`
the default will be returned.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-14 13:38:50 +02:00
+								        except babel.core.UnknownLocaleError:
 								            return default
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
-												[mod] replace engines_languages.json by engines_traits.json

Implementations of the *traits* of the engines.

Engine's traits are fetched from the origin engine and stored in a JSON file in
the *data folder*.  Most often traits are languages and region codes and their
mapping from SearXNG's representation to the representation in the origin search
engine.

To load traits from the persistence::

    searx.enginelib.traits.EngineTraitsMap.from_data()

For new traits new properties can be added to the class::

    searx.enginelib.traits.EngineTraits

.. hint::

   Implementation is downward compatible to the deprecated *supported_languages
   method* from the vintage implementation.

   The vintage code is tagged as *deprecated* and can be removed when all engines
   have been ported to the *traits method*.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-09-29 20:54:46 +02:00
+								    searxng_lang = language_tag(locale)
 								    engine_locale = engine_locales.get(searxng_lang)
 								    if engine_locale is not None:
 								        # There was a 1:1 mapping (e.g. "zh-HK --> zh_Hant" or "zh-CN --> zh_Hans")
 								        return engine_locale
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								    # SearXNG's selected locale is not supported by the engine ..
 								    if locale.territory:
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								        # Try to narrow by *official* languages in the territory (??-XX).
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
 								        for official_language in babel.languages.get_official_languages(locale.territory, de_facto=True):
 								            searxng_locale = official_language + '-' + locale.territory
 								            engine_locale = engine_locales.get(searxng_locale)
 								            if engine_locale is not None:
 								                return engine_locale
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								    # Engine does not support one of the official languages in the territory or
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%).

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								    # there is only a language selected without a territory.
 								    # Now lets have a look if the searxng_lang (the language selected by the
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								    # user) is a official language in other territories.  If so, check if
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								    # engine does support the searxng_lang in this other territory.
 								    if locale.language:
 								        terr_lang_dict = {}
 								        for territory, langs in babel.core.get_global("territory_languages").items():
 								            if not langs.get(searxng_lang, {}).get('official_status'):
 								                continue
 								            terr_lang_dict[territory] = langs.get(searxng_lang)
 								        # first: check fr-FR, de-DE .. is supported by the engine
-												[fix] get_engine_locale: better approximation of 'en' is 'en-US'

Compared to `en-EN` the better approximation of 'en' is 'en-US'.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-14 15:45:07 +02:00
+								        # exception: 'en' --> 'en-US'
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
 								        territory = locale.language.upper()
-												[fix] get_engine_locale: better approximation of 'en' is 'en-US'

Compared to `en-EN` the better approximation of 'en' is 'en-US'.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-14 15:45:07 +02:00
+								        if territory == 'EN':
 								            territory = 'US'
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								        if terr_lang_dict.get(territory):
 								            searxng_locale = locale.language + '-' + territory
 								            engine_locale = engine_locales.get(searxng_locale)
 								            if engine_locale is not None:
 								                return engine_locale
 								        # second: sort by population_percent and take first match
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								        # drawback of "population percent": if there is a territory with a
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								        #   small number of people (e.g 100) but the majority speaks the
-												[fix] spelling

											
										
										
											2023-09-15 00:53:03 -07:00
+								        #   language, then the percentage might be 100% (--> 100 people) but in
 								        #   a different territory with more people (e.g. 10.000) where only 10%
-												[mod] add locale.get_engine_locale to get predictable results

The match_language function sometimes returns incorrect results which is why a
new function get_engine_locale is required.

A bugfix of the match_language is not easily possible, because there is almost
no documentation for it and already the call parameters are undefined.  E.g. the
function processes values like the ones from yahoo::

    "yahoo": [
        "ar",
        ...
        "zh_chs",
        "zh_cht"
     ]

The get_engine_locale has been documented in detail, there is a clear
description of the assumptions as well as the requirements and approximation
rules (read doc-string for more details)::

    Argument ``engine_locales`` is a python dict that maps *SearXNG locales* to
    corresponding *engine locales*:

      <engine>: {
          # SearXNG string : engine-string
          'ca-ES'          : 'ca_ES',
          'fr-BE'          : 'fr_BE',
          'fr-CA'          : 'fr_CA',
          'fr-CH'          : 'fr_CH',
          'fr'             : 'fr_FR',
          ...
          'pl-PL'          : 'pl_PL',
          'pt-PT'          : 'pt_PT'
      }

    .. hint::

       The *SearXNG locale* string has to be known by babel!

In the following you will find a comparison:

>>> import babel.languages
>>> from searx.utils import match_language
>>> from searx.locales import get_engine_locale

Assume we have an engine that supports the following locales:

>>> lang_list = {
...     "zh-CN": "zh_CN",
...     "zh-HK": "zh_HK",
...     "nl-BE": "nl_BE",
...     "fr-CA": "fr_CA",
... }

Assumption:

  A. When a user selects a language the results should be optimized according to
     the selected language.

  B. When user selects a language and a territory the results should be
     optimized with first priority on territory and second on language.

----

Example: (Assumption A.)

  A user selects region 'zh-TW' which should end in zh_HK

hint:
  CN is 'Hans' and HK ('Hant') fits better to TW ('Hant')

>>> get_engine_locale('zh-TW', lang_list)
'zh_HK'
>>> lang_list[match_language('zh-TW', lang_list)]
'zh_CN'

----

Example: (Assumption A.)

  A user selects only the language 'zh' which should end in CN

>>> get_engine_locale('zh', lang_list)
'zh_CN'
>>> lang_list[match_language('zh', lang_list)]
'zh_CN'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

hint:
  priority should be on the territory the user selected.  If the user
  prefers 'fr' he will select 'fr' without a region tag.

>>> get_engine_locale('fr-BE', lang_list, default='unknown')
'nl_BE'
>>> match_language('fr-BE', lang_list, fallback='unknown')
'fr-CA'

----

Example: (Assumption A.)

  A user selects only the language 'fr' which should end in fr_CA

>>> get_engine_locale('fr', lang_list)
'fr_CA'
>>> lang_list[match_language('fr', lang_list)]
'fr_CA'

----

The difference in priority on the territory is best shown with an engine that
supports the following locales:

>>> lang_list = {
...     "fr-FR": "fr_FR",
...     "fr-CA": "fr_CA",
...     "en-GB": "en_GB",
...     "nl-BE": "nl_BE",
... }

----

Example: (Assumption A.)

   A user selects only a language

>>> get_engine_locale('en', lang_list)
'en_GB'
>>> match_language('en', lang_list)
'en-GB'

hint: the engine supports fr_FR and fr_CA since no territory is given, fr_FR
takes priority ..

>>> get_engine_locale('fr', lang_list)
'fr_FR'
>>> lang_list[match_language('fr', lang_list)]
'fr_FR'

----

Example: (Assumption B.)

  A user selects region 'fr-BE' which should end in nl-BE

>>> get_engine_locale('fr-BE', lang_list)
'nl_BE'
>>> lang_list[match_language('fr-BE', lang_list)]
'fr_FR'

----

If the user selects a language and there are two locales like the following:

>>> lang_list = {
...      "fr-BE": "fr_BE",
...      "fr-CH": "fr_CH",
...  }
>>>

>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_BE'

Looks like both functions return the same value, but match_language depends on the
order of the dictionary (which is not predictable):

>>> lang_list = {
...      "fr-CH": "fr_CH",
...      "fr-BE": "fr_BE",
...  }
>>> get_engine_locale('fr', lang_list)
'fr_BE'
>>> lang_list[match_language('fr', lang_list)]
'fr_CH'
>>>

The get_engine_locale selects the locale by looking at the "population percent"
and this percentage is higher in BE (68%) than in CH (21%)

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2022-08-12 17:46:20 +02:00
+								        #   speak the language the total amount of speaker is higher (--> 200
 								        #   people).
 								        #
 								        #   By example: The population of Saint-Martin is 33.000, of which 100%
 								        #   speak French, but this is less than the 30% of the approximately 2.5
 								        #   million Belgian citizens
 								        #
 								        #   - 'fr-MF', 'population_percent': 100.0, 'official_status': 'official'
 								        #   - 'fr-BE', 'population_percent': 38.0, 'official_status': 'official'
 								        terr_lang_list = []
 								        for k, v in terr_lang_dict.items():
 								            terr_lang_list.append((k, v))
 								        for territory, _lang in sorted(terr_lang_list, key=lambda item: item[1]['population_percent'], reverse=True):
 								            searxng_locale = locale.language + '-' + territory
 								            engine_locale = engine_locales.get(searxng_locale)
 								            if engine_locale is not None:
 								                return engine_locale
 								    # No luck: narrow by "language from territory" and "territory from language"
 								    # does not fit to a locale supported by the engine.
 								    if engine_locale is None:
 								        engine_locale = default
 								    return default
-												[mod] replace utils.match_language by locales.match_locale

This patch replaces the *full of magic* ``utils.match_language`` function by a
``locales.match_locale``.  The ``locales.match_locale`` function is based on the
``locales.build_engine_locales`` introduced in 9ae409a0 [1].

In the past SearXNG did only support a search by a language but not in a region.
This has been changed a long time ago and regions have been added to SearXNG
core but not to the engines.  The ``utils.match_language`` was the function to
handle the different aspects of language/regions in SearXNG core and the
supported *languages* in the engine.  The ``utils.match_language`` did it with
some magic and works well for most use cases but fails in some edge cases.

To replace the concurrence of languages and regions in the SearXNG core the
``locales.build_engine_locales`` was introduced in 9ae409a0 [1].  With the last
patches all engines have been migrated to a ``fetch_traits`` and a
language/region concept that is based on ``locales.build_engine_locales``.

To summarize: there is no longer a need for the ``utils.match_language``.

[1] https://github.com/searxng/searxng/pull/1652

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2023-02-07 14:11:58 +01:00
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are updated from
weblate and stores the result in::

    searx/data/locales.json

This file can be built by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we save
roughly 65MB (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
+								def match_locale(searxng_locale: str, locale_tag_list: list[str], fallback: str | None = None) -> str | None:
-												[mod] replace utils.match_language by locales.match_locale

This patch replaces the *full of magic* ``utils.match_language`` function by a
``locales.match_locale``.  The ``locales.match_locale`` function is based on the
``locales.build_engine_locales`` introduced in 9ae409a0 [1].

In the past SearXNG did only support a search by a language but not in a region.
This has been changed a long time ago and regions have been added to SearXNG
core but not to the engines.  The ``utils.match_language`` was the function to
handle the different aspects of language/regions in SearXNG core and the
supported *languages* in the engine.  The ``utils.match_language`` did it with
some magic and works well for most use cases but fails in some edge cases.

To replace the concurrence of languages and regions in the SearXNG core the
``locales.build_engine_locales`` was introduced in 9ae409a0 [1].  With the last
patches all engines have been migrated to a ``fetch_traits`` and a
language/region concept that is based on ``locales.build_engine_locales``.

To summarize: there is no longer a need for the ``utils.match_language``.

[1] https://github.com/searxng/searxng/pull/1652

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2023-02-07 14:11:58 +01:00
+								    """Return tag from ``locale_tag_list`` that best fits to ``searxng_locale``.
 								    :param str searxng_locale: SearXNG's internal representation of locale (de,
 								        de-DE, fr-BE, zh, zh-CN, zh-TW ..).
 								    :param list locale_tag_list: The list of locale tags to select from
 								    :param str fallback: fallback locale tag (if unset --> ``None``)
 								    The rules to find a match are implemented in :py:obj:`get_engine_locale`,
 								    the ``engine_locales`` is build up by :py:obj:`build_engine_locales`.
 								    .. hint::
 								       The *SearXNG locale* string and the members of ``locale_tag_list`` has to
 								       be known by babel!  The :py:obj:`ADDITIONAL_TRANSLATIONS` are used in the
 								       UI and are not known by babel --> will be ignored.
 								    """
 								    # searxng_locale = 'es'
 								    # locale_tag_list = ['es-AR', 'es-ES', 'es-MX']
 								    if not searxng_locale:
 								        return fallback
 								    locale = get_locale(searxng_locale)
 								    if locale is None:
 								        return fallback
 								    # normalize to a SearXNG locale that can be passed to get_engine_locale
 								    searxng_locale = language_tag(locale)
 								    if locale.territory:
 								        searxng_locale = region_tag(locale)
 								    # clean up locale_tag_list
 								    tag_list = []
 								    for tag in locale_tag_list:
 								        if tag in ('all', 'auto') or tag in ADDITIONAL_TRANSLATIONS:
 								            continue
 								        tag_list.append(tag)
 								    # emulate fetch_traits
 								    engine_locales = build_engine_locales(tag_list)
 								    return get_engine_locale(searxng_locale, engine_locales, default=fallback)
-												[mod] reduce memory footprint by not calling babel.Locale.parse at runtime

babel.Locale.parse loads more than 60MB in RAM.  The only purpose is to get:

    LOCALE_NAMES   - searx.data.LOCALES["LOCALE_NAMES"]
    RTL_LOCALES    - searx.data.LOCALES["RTL_LOCALES"]

This commit calls babel.Locale.parse when the translations are updated from
weblate and stores the result in::

    searx/data/locales.json

This file can be built by::

    ./manage data.locales

By storing these variables in searx.data when the translations are updated we save
roughly 65MB (usually 4 workers = 260MB of RAM saved).

Suggested-by: https://github.com/searxng/searxng/discussions/2633#discussioncomment-8490494
Co-authored-by: Markus Heiser <markus.heiser@darmarit.de>

											
										
										
											2024-02-16 20:46:18 +00:00
def build_engine_locales(tag_list: list[str]):
    """Build an ``engine_locales`` dictionary from a list of locale tags.

    The returned mapping can be handed to :py:obj:`get_engine_locale` as its
    ``engine_locales`` argument.  The main caller is :py:obj:`match_locale`;
    the result is comparable to what the ``fetch_traits(..)`` function of an
    engine produces.

    Every tag is resolved by :py:obj:`get_locale`.  A tag that can't be
    resolved is skipped and reported by a warning in the log.  For the
    remaining tags the keys are derived as follows (the value is always the
    unmodified tag from ``tag_list``):

    - locale with a territory: key is the :py:obj:`region_tag`; if the locale
      has a *script code* too, the :py:obj:`language_tag` is added as an
      additional key
    - locale without a territory: key is the :py:obj:`language_tag`

    .. code:: python

       >>> import locales
       >>> engine_locales = locales.build_engine_locales(['en', 'en-US', 'zh', 'zh-CN', 'zh-TW'])
       >>> engine_locales
       {
           'en': 'en', 'en-US': 'en-US',
           'zh': 'zh', 'zh-CN': 'zh-CN', 'zh_Hans': 'zh-CN',
           'zh-TW': 'zh-TW', 'zh_Hant': 'zh-TW'
       }
       >>> get_engine_locale('zh-Hans', engine_locales)
       'zh-CN'

    This function is a good example to understand the language/region model
    of SearXNG:

      SearXNG only distinguishes between **search languages** and **search
      regions**.  By adding the *script-tags*, languages with *script-tags*
      can be assigned to the **regions** that SearXNG supports.

    """
    mapping = {}

    for tag in tag_list:
        parsed = get_locale(tag)
        if parsed is None:
            logger.warning("build_engine_locales: skip locale tag %s / unknown by babel", tag)
            continue

        has_territory = bool(parsed.territory)

        # region key first, so a later language key keeps the original
        # insertion order of the dictionary
        if has_territory:
            mapping[region_tag(parsed)] = tag

        # the language key is set for plain languages and, additionally, for
        # regions that carry a script code
        if not has_territory or parsed.script:
            mapping[language_tag(parsed)] = tag

    return mapping