# Source code for bp_text.language

"""
This module contains functionality dealing with languages (e.g. detection).

Created: 2025-03-27
Author: Ruben Philipp <me@rubenphilipp.com>

$$ Last modified:  15:36:10 Mon Apr 28 2025 CEST
"""

from lingua import Language, LanguageDetectorBuilder
import langcodes

################################################################################

## globals
#: The default languages used by the detection algorithm
#: (cf. `lingua.Language`).
DEFAULT_LANGUAGES = [
    Language.ENGLISH,
    Language.GERMAN,
    Language.FRENCH,
    Language.SPANISH,
    Language.ITALIAN,
    Language.DUTCH,
    Language.PORTUGUESE,
    Language.POLISH,
    Language.SWEDISH,
]

class LanguageDetector:
    """A language detector.

    Holds a list of languages and a detector object built from them via
    `lingua.LanguageDetectorBuilder`. The detector is rebuilt whenever the
    list of languages is replaced through the :py:attr:`languages` property.

    :param languages: A list containing all languages (cf. `lingua.Language`)
      to consider. Default = :any:`DEFAULT_LANGUAGES`
    :type languages: list of `lingua.Language` objects.
    """

    def __init__(self, languages = DEFAULT_LANGUAGES):
        ## store a copy, so that in-place changes to the instance's list can
        ## never alter the shared module-level default (or the caller's list)
        self._languages = list(languages)
        self._detector = None
        ## init
        self.update()

    @property
    def languages(self):
        """The list of languages the detector is built from.

        Assigning a new list rebuilds the detector. The new value must be a
        list of `lingua.Language` objects; anything else is rejected with a
        printed error message and the current state is kept.
        """
        return self._languages

    @languages.setter
    def languages(self, langList):
        is_valid = isinstance(langList, list) \
            and all(isinstance(elem, Language) for elem in langList)
        if not is_valid:
            print("Error: languages must be a list of Language objects.")
            return
        self._languages = list(langList)
        ## update detector
        self.update()

    @property
    def detector(self):
        """This is the actual detector object to use for detection.

        Example::

          detector = language.LanguageDetector().detector
          detector.detect_language_of("Hallo Welt")
        """
        return self._detector

    ########################################

    def update(self):
        """Update the instance.

        (Re-)builds the detector object from the current list of languages.
        """
        ## instantiate detector
        builder = LanguageDetectorBuilder.from_languages(*self._languages)
        self._detector = builder.build()
################################################################################
################################################################################

# Table/dict to convert a BibLaTeX `langid` to a ISO-639-1 code.
# Several langids (regional variants, alternative spellings) may share one
# ISO code.
biblatex_langid_to_iso = {
    "basque": "eu",
    "bulgarian": "bg",
    "catalan": "ca",
    "croatian": "hr",
    "czech": "cs",
    "danish": "da",
    "dutch": "nl",
    "american": "en",
    "USenglish": "en",
    "english": "en",
    "british": "en",
    "UKenglish": "en",
    "canadian": "en",
    "australian": "en",
    "newzealand": "en",
    "estonian": "et",
    "finnish": "fi",
    "french": "fr",
    "german": "de",
    "austrian": "de",
    "swissgerman": "de",
    "ngerman": "de",
    "naustrian": "de",
    "nswissgerman": "de",
    "greek": "el",
    "magyar": "hu",
    "hungarian": "hu",
    "icelandic": "is",
    "italian": "it",
    "latvian": "lv",
    "lithuanian": "lt",
    "marathi": "mr",
    "norsk": "no",
    "nynorsk": "nn",
    "polish": "pl",
    "brazil": "pt",
    "portuguese": "pt",
    "portuges": "pt",
    "romanian": "ro",
    "russian": "ru",
    "serbian": "sr",
    "serbianc": "sr",
    "slovak": "sk",
    "slovene": "sl",
    "slovenian": "sl",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",
    "ukrainian": "uk",
}

# Table/dict to convert an ISO-639-1 code to a BibLaTeX langid.
# Holds exactly one (preferred) langid for every ISO code that occurs in
# `biblatex_langid_to_iso`, e.g. 'de' maps to 'ngerman' rather than 'german'.
iso_to_biblatex_langid = {
    "eu": "basque",
    "bg": "bulgarian",
    "ca": "catalan",
    "hr": "croatian",
    "cs": "czech",
    "da": "danish",
    "nl": "dutch",
    "en": "english",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "de": "ngerman",
    "el": "greek",
    "hu": "hungarian",
    "is": "icelandic",
    "it": "italian",
    "lv": "latvian",
    "lt": "lithuanian",
    "mr": "marathi",
    "no": "norsk",
    "nn": "nynorsk",
    "pl": "polish",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sr": "serbian",
    "sk": "slovak",
    "sl": "slovenian",
    "es": "spanish",
    "sv": "swedish",
    "tr": "turkish",
    "uk": "ukrainian",
}
def langid_to_iso_639_1(langid, fallback="en"):
    """Translate a Bib(La)TeX `langid` into its ISO-639-1 code.

    The `langid` is looked up verbatim in the `biblatex_langid_to_iso` table
    (see the BibLaTeX documentation for a list of supported langids,
    https://ctan.org/pkg/biblatex). When the table has no entry, a notice is
    printed and the `fallback` is returned instead. With an empty/None
    `fallback`, a notice is printed and None is returned.

    Example::

      langid_to_iso_639_1("ngerman")
      # => 'de'

    :param langid: The langid to standardize/parse.
    :type langid: string
    :param fallback: Any string (preferably an ISO-639-1 code) to return when
      no matching translation is found. Default = "en"
    :type fallback: string
    :return: The ISO-639-1 code, the fallback, or None.
    """
    iso_code = biblatex_langid_to_iso.get(langid)
    if iso_code:
        return iso_code
    if not fallback:
        print(f"langid '{langid}' not found. No fallback specified.")
        return None
    print(f"langid '{langid}' not found. Falling back to '{fallback}'.")
    return fallback
def iso_639_1_to_langid(iso_code, fallback="english"):
    """The inverse of :py:func:`langid_to_iso_639_1`.

    Looks the ISO-639-1 code up verbatim in the `iso_to_biblatex_langid`
    table and returns the BibLaTeX `langid` stored there. When the table has
    no entry, a notice is printed and the `fallback` is returned instead.
    With an empty/None `fallback`, a notice is printed and None is returned.

    :param iso_code: The ISO-639-1 code (e.g. "de") to convert.
    :type iso_code: string
    :param fallback: The langid to return if the code is not in the
      translation table. Should be a valid BibLaTeX langid (cf. BibLaTeX
      documentation). Default = "english"
    :type fallback: string
    :return: The BibLaTeX langid, the fallback, or None.
    """
    langid = iso_to_biblatex_langid.get(iso_code)
    if langid:
        return langid
    ## both notices share the same first half
    notice = f"no langid found for iso_code '{iso_code}'. "
    if not fallback:
        print(notice + "No fallback specified.")
        return None
    print(notice + f"Falling back to '{fallback}'.")
    return fallback
def convert_langcode(langcode, inFormat="langid", outFormat="iso_code_639_1"):
    """Convert a language code to a different form.

    This could for example be used to translate a Bib(La)TeX `langid` to an
    ISO-639-1 or ISO-639-3 code (or vice versa).

    Possible in- and out-formats are:

    * `langid`: a Bib(La)TeX `langid` (e.g. `ngerman` or `english`, cf.
      BibLaTeX documentation at https://ctan.org/pkg/biblatex)
    * `iso_code_639_1`: an ISO-639-1 code (e.g. `de` or `en`)
    * `iso_code_639_3`: an ISO-639-3 code (e.g. `deu` or `eng`)

    If `inFormat` equals `outFormat`, a notice is printed and `langcode` is
    returned unchanged. If either format is not one of the values listed
    above, a notice is printed and None is returned.

    A `langid` is translated via :py:func:`langid_to_iso_639_1` and an output
    `langid` is produced via :py:func:`iso_639_1_to_langid`, both with their
    default fallbacks; unknown codes therefore yield the fallback language
    rather than None.

    Example::

      convert_langcode("ngerman")
      # => 'de'
      convert_langcode("en", "iso_code_639_1", "langid")
      # => 'english'

    :param langcode: The language code to translate.
    :type langcode: string
    :param inFormat: The input format (i.e. of `langcode`). Must be one of the
      possible in/out formats (see above). Default = "langid"
    :type inFormat: string
    :param outFormat: The output format (i.e. of the return value). Must be
      one of the possible in/out formats (see above).
      Default = "iso_code_639_1"
    :type outFormat: string
    :return: The converted langcode in the `outFormat`, or None when a format
      is invalid.
    :rtype: string
    """
    formats = ("langid", "iso_code_639_1", "iso_code_639_3")
    ## validate up front, so that no lookup (and no fallback notice) happens
    ## for a request that cannot be fulfilled anyway
    if inFormat not in formats:
        print(f"convert_langcode: Invalid inFormat '{inFormat}'.")
        return None
    if outFormat not in formats:
        print(f"convert_langcode: Invalid outFormat '{outFormat}'.")
        return None
    if inFormat == outFormat:
        ## nothing to convert
        print("convert_langcode: inFormat is the same as outFormat. Ignoring.")
        return langcode

    ## step 1: bring the input into its ISO-639-1 form
    if inFormat == "langid":
        iso1 = langid_to_iso_639_1(langcode)
    elif inFormat == "iso_code_639_1":
        iso1 = langcode
    else:
        ## iso_code_639_3
        iso1 = langcodes.standardize_tag(langcode)

    ## step 2: translate the ISO-639-1 form to the requested output
    if outFormat == "iso_code_639_1":
        return iso1
    if outFormat == "langid":
        return iso_639_1_to_langid(iso1)
    ## iso_code_639_3
    return langcodes.Language.get(iso1).to_alpha3()
################################################################################ ## EOF language.py