1234567891011121314151617181920212223242526272829303132333435363738394041 |
- from camel_tools.utils.normalize import (
- normalize_alef_ar,
- normalize_alef_maksura_ar,
- normalize_teh_marbuta_ar,
- normalize_unicode,
- )
- from mishkal.tashkeel import TashkeelClass
- from scriptshifter.exceptions import BREAK
# Single TashkeelClass instance created at import time; tashkeel_vocalize()
# calls its tashkeel() method on every invocation.
vocalizer = TashkeelClass()
def camel_normalize(ctx):
    """
    Rewrite the context's source text in normalized Arabic form.

    The text read from ``ctx.src`` is passed, in order, through
    ``normalize_unicode``, ``normalize_alef_ar``,
    ``normalize_alef_maksura_ar`` and ``normalize_teh_marbuta_ar``; the
    final result is stored in ``ctx._src``. Nothing is returned.

    Note: Persian (and probably other non-Arabic) letters are not
    normalized by this step, e.g. ﻯ (U+FEEF) → ی (U+06CC).
    """
    # Order matters: Unicode normalization runs first, then the
    # letter-specific normalizers.
    pipeline = (
        normalize_unicode,
        normalize_alef_ar,
        normalize_alef_maksura_ar,
        normalize_teh_marbuta_ar,
    )

    text = ctx.src
    for step in pipeline:
        text = step(text)

    ctx._src = text
def tashkeel_vocalize(ctx):
    """
    Replace the source text with its vocalized (vowel-marked) form.

    The text read from ``ctx.src`` is run through the module-level
    ``vocalizer`` and the result is written to ``ctx._src``, so the
    source text itself is changed.

    If anything raises during that step, a message is appended to
    ``ctx.warnings`` and ``BREAK`` is returned; otherwise the function
    returns ``None``.
    """
    try:
        # The assignment stays inside the try so that a failure while
        # storing the result is reported the same way as a vocalizer error.
        vocalized = vocalizer.tashkeel(ctx.src)
        ctx._src = vocalized
    except Exception as err:
        message = f"Error vocalizing text: {err}"
        ctx.warnings.append(message)
        return BREAK

    return None
|