|
@@ -0,0 +1,43 @@
|
|
|
|
+#!/bin/bash
|
|
|
|
+
|
|
|
|
+# CAMeL romanization language model setup
|
|
|
|
+# Adapted from https://github.com/fadhleryani/Arabic_ALA-LC_Romanization#data-1
|
|
|
|
+
|
|
|
|
+set -e
|
|
|
|
+
|
|
|
|
+BASEDIR=$( dirname -- "$( readlink -f -- "$0"; )")
|
|
|
|
+
|
|
|
|
+cd "${BASEDIR}/../../../ext/arabic_rom"
|
|
|
|
+
|
|
|
|
+# Download data
|
|
|
|
+if [ ! -d "./data/raw_records" ]; then
|
|
|
|
+ echo "Downloading data."
|
|
|
|
+ make download_data
|
|
|
|
+fi
|
|
|
|
+
|
|
|
|
+# Collect Arabic records
|
|
|
|
+echo "Collecting records."
|
|
|
|
+python src/data/collect_arabic_records.py data/raw_records/umich
|
|
|
|
+python src/data/collect_arabic_records.py data/raw_records/loc
|
|
|
|
+python src/data/collect_arabic_records.py data/raw_records/aco/work --sub_directory_filter marcxml_out
|
|
|
|
+
|
|
|
|
+# Extract parallel lines
|
|
|
|
+echo "Extracting lines."
|
|
|
|
+make extract_lines
|
|
|
|
+
|
|
|
|
+# Clean, preprocess, and split
|
|
|
|
+echo "Preprocessing data set."
|
|
|
|
+make data_set
|
|
|
|
+
|
|
|
|
+# Train MLE model
|
|
|
|
+echo "Bulding MLE simple rules."
|
|
|
|
+python src/loc_transcribe.py predict simple dev
|
|
|
|
+python3 src/loc_transcribe.py train mle --size {1,0.5,0.25,0.125,0.0625,0.03125,0.015625}
|
|
|
|
+python3 src/loc_transcribe.py predict mle dev --mle_model models/mle/size1.0.tsv --backoff predictions_out/simple/dev/simple.out
|
|
|
|
+#make predict_mle # NOTE this should replace the 2 lines above but there is no Makefile target.
|
|
|
|
+
|
|
|
|
+# Seq2Seq
|
|
|
|
+echo "Preparing Seq2seq."
|
|
|
|
+make prep_seq2seq
|
|
|
|
+echo "Training models."
|
|
|
|
+python3 src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}
|