1 year ago · 7d28d8d36c
--- a/scriptshifter/hooks/arabic/build_model.sh
+++ b/scriptshifter/hooks/arabic/build_model.sh
@@ -0,0 +1,43 @@
 
															+#!/bin/bash
														
 
															+
														
 
															+# CAMeL romanization language model setup
														
 
															+# Adapted from https://github.com/fadhleryani/Arabic_ALA-LC_Romanization#data-1
														
 
															+
														
 
															+set -e
														
 
															+
														
 
															+BASEDIR=$( dirname -- "$( readlink -f -- "$0"; )")
														
 
															+
														
 
															+cd "${BASEDIR}/../../../ext/arabic_rom"
														
 
															+
														
 
															+# Download data
														
 
															+if [ ! -d "./data/raw_records" ]; then
														
 
															+    echo "Downloading data."
														
 
															+    make download_data
														
 
															+fi
														
 
															+
														
 
															+# Collect Arabic records
														
 
															+echo "Collecting records."
														
 
															+python src/data/collect_arabic_records.py data/raw_records/umich
														
 
															+python src/data/collect_arabic_records.py data/raw_records/loc
														
 
															+python src/data/collect_arabic_records.py data/raw_records/aco/work --sub_directory_filter marcxml_out
														
 
															+
														
 
															+# Extract parallel lines
														
 
															+echo "Extracting lines."
														
 
															+make extract_lines
														
 
															+
														
 
															+# Clean, preprocess, and split
														
 
															+echo "Preprocessing data set."
														
 
															+make data_set
														
 
															+
														
 
															+# Train MLE model
														
 
															+echo "Bulding MLE simple rules."
														
 
															+python src/loc_transcribe.py predict simple dev
														
 
															+python3 src/loc_transcribe.py train mle --size {1,0.5,0.25,0.125,0.0625,0.03125,0.015625}
														
 
															+python3 src/loc_transcribe.py predict mle dev --mle_model models/mle/size1.0.tsv --backoff predictions_out/simple/dev/simple.out
														
 
															+#make predict_mle  # NOTE this should replace the 2 lines above but there is no Makefile target.
														
 
															+
														
 
															+# Seq2Seq
														
 
															+echo "Preparing Seq2seq."
														
 
															+make prep_seq2seq
														
 
															+echo "Training models."
														
 
															+python3 src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}