# Source code for asr_eval.align.parsing

from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
import re
import string
from typing import Callable

from asr_eval.align.transcription import (
    Wildcard,
    Transcription,
    SingleVariantTranscription,
    Token,
    MultiVariantBlock,
)


# Public API of this module; everything else is an implementation detail.
__all__ = [
    "Parser",
    "DEFAULT_PARSER",
    "PUNCT",
]


PUNCT = re.escape(r""".,!?:;…-‑–—'"‘“”«»()[]{}""")
r"""
A default set of punctuation characters to exclude from words
:code:`.,!?:;…-‑–—'"‘“”«»()[]{}`. Note that this does not affect parsing
multivariant syntax. To override, create a
:class:`~asr_eval.align.parsing.Parser` with custom
:attr:`~asr_eval.align.parsing.Parser.tokenizing` field.

:meta hide-value:
"""


@dataclass
class Parser:
    r"""Parses into words and (optionally) normalizes prediction or annotation.

    Performs the following:

    1. Preprocesses the whole text if
       :attr:`~asr_eval.align.parsing.Parser.preprocessing` is set. This stage
       is suitable for various normalization methods, if they are used, such
       as numerals-to-digits normalizers or filler words removers.
    2. If :meth:`~asr_eval.align.parsing.Parser.parse_transcription` is
       called, processes multivariant syntax.
    3. Splits all the text blocks into words with a regexp stored in the
       :attr:`~asr_eval.align.parsing.Parser.tokenizing` attribute.
    4. Postprocesses each word if
       :attr:`~asr_eval.align.parsing.Parser.postprocessing` is set. This
       stage is suitable for lowercase conversion.

    A :const:`~asr_eval.align.parsing.DEFAULT_PARSER` is an instance of the
    Parser with default parameters.

    Example:

    >>> from asr_eval.align.parsing import DEFAULT_PARSER  # same as Parser()
    >>> text = 'Hi there {fouth|4|t-th} {eh} <*>'
    >>> parsed = DEFAULT_PARSER.parse_transcription(text)
    >>> print(parsed.blocks)  # doctest: +NORMALIZE_WHITESPACE
    (Token(hi), Token(there),
     MultiVariantBlock([Token(fouth)], [Token(4)], [Token(t), Token(th)]),
     MultiVariantBlock([Token(eh)], []),
     Token(Wildcard()))
    >>> from dataclasses import asdict
    >>> asdict(parsed.blocks[0])  # doctest: +NORMALIZE_WHITESPACE
    {'value': 'hi', 'uid': 'id0', 'start_pos': 0, 'end_pos': 2,
     'start_time': nan, 'end_time': nan}
    >>> print(parsed.colorize())  # doctest: +SKIP

    .. raw:: html

        <style>.y {background-color: #e0e841;} .g {background-color: #41e8a8;}
        .b {background-color: #41c7e8;}</style>
        <span style="white-space='pre'; font-family: 'Consolas', 'Ubuntu Mono',
        'Monaco', monospace"> <span class="y">Hi</span>
        <span class="g">there</span> {<span class="b">fourth</span>|<span
        class="y">4</span>|<span class="g">4</span>-<span
        class="b">th</span>} {<span class="y">eh</span>}
        <span class="g">&lt;*&gt;</span></span>

    Note:

    1. Why not just :code:`nltk.word_tokenize`? In *asr_eval* words keep
       references to their positions in the original text, which
       :code:`word_tokenize` does not support.
    2. By making a Parser with :code:`tokenizing=r'\\w|\\s|[^\\w\\s{PUNCT}]'`
       you can parse strings into characters, excluding punctuation. In this
       case, :class:`~asr_eval.align.alignment.Alignment` will calculate CER
       (character error rate) instead of WER.
    3. You can create named parsers in :mod:`asr_eval.bench.parsers`.
    4. When labeling a dataset, the annotator should be aware of the
       tokenization scheme. For example, if :code:`3/4$` is tokenized as a
       single word, then :code:`3/4$` and :code:`3 / 4 $` (with spaces) are
       different options, and both should be included in a multivariant
       block. See :doc:`/guide_alignment_wer` for details.
    """

    tokenizing: str = rf'\w+|[^\w\s{PUNCT}]+'
    r"""A regexp to extract words, by default :code:`\\w+|[^\\w\\s{PUNCT}]+`,
    where :const:`~asr_eval.align.parsing.PUNCT` are punctuation characters.

    :meta hide-value:
    """

    preprocessing: Callable[[str], str] = lambda text: text
    """A text preprocessing method set as :code:`Callable[[str], str]`, by
    default does nothing. Is suitable for text-to-text operations such as
    normalizers or filler word removers. Note that after parsing the
    :attr:`~asr_eval.align.transcription.Transcription.text` field in
    :class:`~asr_eval.align.transcription.Transcription` will contain the
    preprocessed version, and the original version will be gone.

    Example:

    >>> from asr_eval.align.parsing import Parser
    >>> import re
    >>> def filler_remover(text: str) -> str:
    ...     for word in 'eh', 'oh', 'umm':
    ...         text = re.sub(word, '', text, flags=re.IGNORECASE)
    ...     return text
    >>> parser = Parser(preprocessing=filler_remover)
    >>> parsed = parser.parse_transcription('Umm eh of course')
    >>> print(parsed.text, parsed.blocks)
     of course [Token(of), Token(course)]

    See more examples in :mod:`asr_eval.bench.parsers`.

    :meta hide-value:
    """

    postprocessing: Callable[[str], str] = (
        lambda text: text.lower().replace('ё', 'е')
    )
    """A word postprocessing method set as :code:`Callable[[str], str]`, by
    default performs lowercase and diacritic conversion:

    .. code-block:: python

        postprocessing=lambda text: text.lower().replace('ё', 'е')

    Will only affect the :attr:`~asr_eval.align.transcription.Token.value`
    field in :class:`~asr_eval.align.transcription.Token`. This is useful to
    match lowercase words, while tracking their positions in the original
    :attr:`~asr_eval.align.transcription.Transcription.text` with
    capitalization and punctuation.

    :meta hide-value:
    """
[docs] def parse_single_variant_transcription( self, text: str ) -> SingleVariantTranscription: """Parses a text without multivariant blocks. In general, one needs this method for typing purposes only, because :meth:`~asr_eval.align.parsing.Parser.parse_transcription` supports both multivariant and single-variant transcriptions. """ text = self.preprocessing(text) tokens = self._split_text_into_tokens(text) result = SingleVariantTranscription(text, tuple(tokens)) for i, t in enumerate(result.list_all_tokens()): t.uid = 'id' + str(i) return result
    def parse_transcription(self, text: str) -> Transcription:
        """Parses a text possibly containing multivariant blocks.

        See example in the class docstring.
        """
        # NOTE: the multivariant grammar could also be parsed with pyparsing
        # (CharsNotIn/OneOrMore/Group/...), but that is not obvious for ones
        # who are not familiar with pyparsing, and also gives uninformative
        # parsing errors — hence the explicit regex approach below.
        MULTIVARIANT_PATTERN = re.compile(
            r'({[^{}]*?})'  # multi variant: a brace-delimited block
            '|'
            r'(?<=})([^{}]+?)(?={)'  # single variant: text between blocks
        )
        text = self.preprocessing(text)
        blocks: list[Token | MultiVariantBlock] = []
        # Wrap the text in '}' ... '{' so the lookbehind/lookahead branch of
        # the pattern also matches leading and trailing single-variant runs.
        for match in re.finditer(MULTIVARIANT_PATTERN, '}' + text + '{'):
            text_part = match.group()
            start = match.start() - 1  # account for '}' (see in re.finditer)
            end = match.end() - 1  # account for '}' (see in re.finditer)
            if text_part.startswith('{'):
                # Multivariant block: enforce whitespace separation so token
                # positions in the original text stay well-defined.
                if start > 0:
                    assert (c := text[start - 1]) in string.whitespace, (
                        f'put a space before a multivariant block, got "{c}"'
                    )
                if end < len(text):
                    assert (c := text[end]) in string.whitespace, (
                        f'put a space after a multivariant block, got "{c}"'
                    )
                # options_raw: (option text, start pos)
                options_raw: list[tuple[str, int]] = []
                # Append '|' so the trailing option is matched like the rest.
                for option_match in re.finditer(
                    r'([^\|]*)\|', text_part[1:-1] + '|'
                ):
                    option_text = option_match.group(1)
                    start_pos = start + option_match.start() + 1
                    if option_text.strip().startswith('~'):
                        # lexically wrong but acceptable form
                        option_text = option_text.strip()[1:]
                    if (match2 := re.match(
                        r'([^\<]+)\<(\w+)\>', option_text.strip()
                    )) is not None:
                        # forms like Facebook<е>: expand to the bare base and
                        # the hyphenated base-suffix variant
                        # TODO: ambiguous: need to add empty option to
                        # {Facebook<е>} ??
                        # TODO: handle this in single-variant blocks
                        # NOTE(review): after strip()/'~' removal, start_pos
                        # still points at the unstripped option start — token
                        # positions may be off by the removed prefix; confirm.
                        base, suffix = match2.groups()
                        options_raw.append((f'{base}', start_pos))
                        options_raw.append((f'{base}-{suffix}', start_pos))
                    else:
                        options_raw.append((option_text, start_pos))
                # Tokenize each option and shift its token spans from
                # option-relative to whole-text coordinates.
                options: list[list[Token]] = []
                for option_text, start_pos in options_raw:
                    option_tokens = self._split_text_into_tokens(option_text)
                    _shift_tokens_inplace(option_tokens, start_pos)
                    options.append(option_tokens)
                if len(options) == 1:
                    # {word} means "word or nothing": add the empty option.
                    assert len(options[0]), 'empty multivariant block'
                    options.append([])
                blocks.append(MultiVariantBlock(
                    options=options,
                    start_pos=start,
                    end_pos=end,
                ))
            else:
                # Plain text between blocks: tokenize and shift positions.
                new_tokens = self._split_text_into_tokens(text_part)
                _shift_tokens_inplace(new_tokens, shift=start)
                blocks += new_tokens
        result = Transcription(text, tuple(blocks))
        # Assign sequential uids to tokens, then separate uids to blocks.
        for i, t in enumerate(result.list_all_tokens()):
            t.uid = 'id' + str(i)
        i = 0
        for block in result.blocks:
            if isinstance(block, MultiVariantBlock):
                block.uid = 'mvid' + str(i)
                i += 1
        return result
def _split_text_into_tokens(self, text: str) -> list[Token]: """Finds words in the text and return them as a list of Token. """ tokens = list(_regexp_split_text_into_tokens( text, {'word': self.tokenizing} )) for token in tokens: if not isinstance(token.value, Wildcard): token.value = self.postprocessing(token.value) return tokens
DEFAULT_PARSER = Parser()
"""An instance of :class:`~asr_eval.align.parsing.Parser` with default
parameters.

:meta hide-value:
"""


def _shift_tokens_inplace(tokens: list[Token], shift: int = 0) -> None:
    """Offsets each token's character span by ``shift`` (mutates tokens)."""
    for t in tokens:
        t.start_pos += shift
        t.end_pos += shift


def _regexp_split_text_into_tokens(
    text: str, patterns: dict[str, str]
) -> Iterable[Token]:
    """Searches sequentially for any of the given patterns.

    For each match returns a :class:`~asr_eval.align.transcription.Token`.
    """
    # Combine all patterns into one alternation of named groups, so a single
    # scan of the text tells which pattern matched where.
    pattern = '|'.join(
        f'(?P<{name}>{subpattern})' for name, subpattern in patterns.items()
    )
    for match in re.finditer(
        re.compile(pattern, re.MULTILINE|re.DOTALL|re.UNICODE), text
    ):
        # Exactly one named alternative is expected to match per position.
        found_groups = [
            (name, substr) for name, substr in match.groupdict().items()
            if substr is not None
        ]
        assert len(found_groups) == 1
        name, word = found_groups[0]
        assert name in patterns
        yield Token(
            # The wildcard marker string becomes a Wildcard sentinel value.
            value=Wildcard() if word == Wildcard._SYMBOL else word,  # pyright: ignore[reportPrivateUsage]
            start_pos=match.start(),
            end_pos=match.end(),
        )