build_model.sh 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. #!/bin/bash
  2. # CAMeL romanization language model setup
  3. # Adapted from https://github.com/fadhleryani/Arabic_ALA-LC_Romanization#data-1
  4. set -e
  5. BASEDIR=$( dirname -- "$( readlink -f -- "$0"; )")
  6. cd "${BASEDIR}/../../../ext/arabic_rom"
  7. # Download data
  8. if [ ! -d "./data/raw_records" ]; then
  9. echo "Downloading data."
  10. make download_data
  11. fi
  12. # Collect Arabic records
  13. echo "Collecting records."
  14. python src/data/collect_arabic_records.py data/raw_records/umich
  15. python src/data/collect_arabic_records.py data/raw_records/loc
  16. python src/data/collect_arabic_records.py data/raw_records/aco/work --sub_directory_filter marcxml_out
  17. # Extract parallel lines
  18. echo "Extracting lines."
  19. make extract_lines
  20. # Clean, preprocess, and split
  21. echo "Preprocessing data set."
  22. make data_set
  23. # Train MLE model
  24. echo "Bulding MLE simple rules."
  25. python src/loc_transcribe.py predict simple dev
  26. python3 src/loc_transcribe.py train mle --size {1,0.5,0.25,0.125,0.0625,0.03125,0.015625}
  27. python3 src/loc_transcribe.py predict mle dev --mle_model models/mle/size1.0.tsv --backoff predictions_out/simple/dev/simple.out
  28. #make predict_mle # NOTE this should replace the 2 lines above but there is no Makefile target.
  29. # Seq2Seq
  30. echo "Preparing Seq2seq."
  31. make prep_seq2seq
  32. echo "Training models."
  33. python3 src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}