# Source code for asr_eval.correction.bow_corpus

import re
from typing import Literal

from tqdm.auto import tqdm

from asr_eval.linguistics.linguistics import word_freq, lemmatize_ru


# Public API of this module: only the corpus preparation entry point.
__all__ = ['prepare_domain_specific_bag_of_words_corpus']



# [docs]
def prepare_domain_specific_bag_of_words_corpus(
    corpus: str,
    pattern: str = r'\w+',
    lemmatize: Literal['add', 'replace', 'no'] = 'add',
    wordfreq_threshold: float | None = 2,
    wordfreq_lang: str = 'ru',
    pbar: bool = False,
) -> set[str]:
    """ Extracts words from domain specific corpus or dictionary.
    
    For each word adds/replaces with lemmatized form. Filters out too
    frequent words based on wordfreq_threshold.
    """
    words: set[str] = set(re.findall(pattern, corpus.lower()))
    
    words |= {word.replace('ё', 'е') for word in words if 'ё' in word}
    
    if lemmatize != 'no':
        lemmatized_words = {
            lemmatize_ru(word)
            for word in tqdm(words, disable=not pbar, desc='lematizing')
        }
        match lemmatize:
            case 'add':
                words |= lemmatized_words
            case 'replace':
                words = lemmatized_words
    
    if wordfreq_threshold is not None:
        words = {
            word for word in tqdm(words, disable=not pbar, desc='wordfreq')
            if word_freq(word, lang=wordfreq_lang) < wordfreq_threshold
        }
    
    return words