Source code for asr_eval.correction.bow_corpus

import re
from typing import Literal

from tqdm.auto import tqdm

from asr_eval.linguistics.linguistics import word_freq, lemmatize_ru


# Public API: the names exported by a star-import of this module.
__all__ = [
    'prepare_domain_specific_bag_of_words_corpus',
]


[docs] def prepare_domain_specific_bag_of_words_corpus( corpus: str, pattern: str = r'\w+', lemmatize: Literal['add', 'replace', 'no'] = 'add', wordfreq_threshold: float | None = 2, wordfreq_lang: str = 'ru', pbar: bool = False, ) -> set[str]: """ Extracts words from domain specific corpus or dictionary. For each word adds/replaces with lemmatized form. Filters out too frequent words based on wordfreq_threshold. """ words: set[str] = set(re.findall(pattern, corpus.lower())) words |= {word.replace('ё', 'е') for word in words if 'ё' in word} if lemmatize != 'no': lemmatized_words = { lemmatize_ru(word) for word in tqdm(words, disable=not pbar, desc='lematizing') } match lemmatize: case 'add': words |= lemmatized_words case 'replace': words = lemmatized_words if wordfreq_threshold is not None: words = { word for word in tqdm(words, disable=not pbar, desc='wordfreq') if word_freq(word, lang=wordfreq_lang) < wordfreq_threshold } return words