|
@@ -0,0 +1,41 @@
|
|
|
+from camel_tools.utils.normalize import (
|
|
|
+ normalize_alef_ar,
|
|
|
+ normalize_alef_maksura_ar,
|
|
|
+ normalize_teh_marbuta_ar,
|
|
|
+ normalize_unicode,
|
|
|
+)
|
|
|
+from mishkal.tashkeel import TashkeelClass
|
|
|
+
|
|
|
+from scriptshifter.exceptions import BREAK
|
|
|
+
|
|
|
+
|
|
|
# Module-level TashkeelClass instance: built once when this module is imported
# and used by tashkeel_vocalize() for every call.
+vocalizer = TashkeelClass()
|
|
|
+
|
|
|
+
|
|
|
def camel_normalize(ctx):
    """
    Run the source text through the CAMeL Tools normalizers.

    The normalizers are applied in a fixed order: generic Unicode
    normalization first, then alef, alef maksura and teh marbuta. The
    result is written back to ``ctx._src``; nothing is returned.

    Note: Persian (and probably other non-Arabic) letters are not
    normalized, e.g. ﻯ (U+FEEF) → ی (U+06CC)
    """
    text = ctx.src
    # Order matters: each step receives the output of the previous one.
    for step in (
        normalize_unicode,
        normalize_alef_ar,
        normalize_alef_maksura_ar,
        normalize_teh_marbuta_ar,
    ):
        text = step(text)

    ctx._src = text
|
|
|
+
|
|
|
+
|
|
|
def tashkeel_vocalize(ctx):
    """
    Add implicit vowel marks to the Arabic source text.

    The vocalized text replaces the source text held by the context
    (``ctx._src``). If vocalization raises, the source is left as it was
    and a message is appended to ``ctx.warnings`` instead.

    Returns ``BREAK``.
    """
    try:
        vocalized = vocalizer.tashkeel(ctx.src)
        ctx._src = vocalized
    except Exception as err:
        # Best effort: record the failure as a warning rather than raising.
        ctx.warnings.append(f"Error vocalizing text: {err}")

    # NOTE(review): indentation was lost in the reviewed copy; BREAK is
    # assumed to be returned at function level, on both the success and the
    # failure path — confirm against the original file.
    return BREAK
|