normalize.py 965 B

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. from camel_tools.utils.normalize import (
  2. normalize_alef_ar,
  3. normalize_alef_maksura_ar,
  4. normalize_teh_marbuta_ar,
  5. normalize_unicode,
  6. )
  7. from mishkal.tashkeel import TashkeelClass
  8. from scriptshifter.exceptions import BREAK
# Module-level Mishkal vocalizer, instantiated once at import time and
# reused by every call to tashkeel_vocalize() below.
vocalizer = TashkeelClass()
  10. def camel_normalize(ctx):
  11. """
  12. Normalize complex Arabic characters.
  13. Note: this doesn't normalize Persian (and probably other non-Arabic)
  14. letters, e.g. ﻯ (U+FEEF) → ی (U+06CC)
  15. """
  16. norm = normalize_unicode(ctx.src)
  17. norm = normalize_alef_ar(norm)
  18. norm = normalize_alef_maksura_ar(norm)
  19. norm = normalize_teh_marbuta_ar(norm)
  20. ctx._src = norm
  21. def tashkeel_vocalize(ctx):
  22. """
  23. Vocalize Arabic text by adding implicit vowel marks.
  24. Note that this changes the source text.
  25. """
  26. try:
  27. ctx._src = vocalizer.tashkeel(ctx.src)
  28. except Exception as e:
  29. ctx.warnings.append(f"Error vocalizing text: {e}")
  30. return BREAK