Kaynağa Gözat

WIP add steps for model building and trainig.

scossu 1 yıl önce
ebeveyn
işleme
7d28d8d36c
1 değiştirilmiş dosya ile 43 ekleme ve 0 silme
  1. 43 0
      scriptshifter/hooks/arabic/build_model.sh

+ 43 - 0
scriptshifter/hooks/arabic/build_model.sh

@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# CAMeL romanization language model setup
+# Adapted from https://github.com/fadhleryani/Arabic_ALA-LC_Romanization#data-1
+
+set -e
+
+BASEDIR=$( dirname -- "$( readlink -f -- "$0"; )")
+
+cd "${BASEDIR}/../../../ext/arabic_rom"
+
+# Download data
+if [ ! -d "./data/raw_records" ]; then
+    echo "Downloading data."
+    make download_data
+fi
+
+# Collect Arabic records
+echo "Collecting records."
+python src/data/collect_arabic_records.py data/raw_records/umich
+python src/data/collect_arabic_records.py data/raw_records/loc
+python src/data/collect_arabic_records.py data/raw_records/aco/work --sub_directory_filter marcxml_out
+
+# Extract parallel lines
+echo "Extracting lines."
+make extract_lines
+
+# Clean, preprocess, and split
+echo "Preprocessing data set."
+make data_set
+
+# Train MLE model
+echo "Bulding MLE simple rules."
+python src/loc_transcribe.py predict simple dev
+python3 src/loc_transcribe.py train mle --size {1,0.5,0.25,0.125,0.0625,0.03125,0.015625}
+python3 src/loc_transcribe.py predict mle dev --mle_model models/mle/size1.0.tsv --backoff predictions_out/simple/dev/simple.out
+#make predict_mle  # NOTE this should replace the 2 lines above but there is no Makefile target.
+
+# Seq2Seq
+echo "Preparing Seq2seq."
+make prep_seq2seq
+echo "Training models."
+python3 src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}