#!/bin/bash
# CAMeL romanization language model setup
# Adapted from https://github.com/fadhleryani/Arabic_ALA-LC_Romanization#data-1
#
# Runs, in order: data download (only if data/raw_records is missing), Arabic
# record collection, parallel line extraction, data set preprocessing, MLE
# training/prediction, and seq2seq preparation/training. All steps execute from
# the ext/arabic_rom directory, located relative to this script's own path.
#
# Environment:
#   PYTHON  Interpreter used for every Python step (default: python3).

# Abort on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# One interpreter for all steps, so src/loc_transcribe.py is never run under
# two different Python installations within the same pipeline.
PYTHON="${PYTHON:-python3}"

# Directory containing this script, with symlinks resolved.
BASEDIR=$(dirname -- "$(readlink -f -- "$0")")
ROM_DIR="${BASEDIR}/../../../ext/arabic_rom"

cd -- "${ROM_DIR}" || {
  printf 'Cannot change directory to %s\n' "${ROM_DIR}" >&2
  exit 1
}

# Download data (skipped when the raw records directory already exists)
if [[ ! -d "./data/raw_records" ]]; then
  echo "Downloading data."
  make download_data
fi

# Collect Arabic records
echo "Collecting records."
"${PYTHON}" src/data/collect_arabic_records.py data/raw_records/umich
"${PYTHON}" src/data/collect_arabic_records.py data/raw_records/loc
"${PYTHON}" src/data/collect_arabic_records.py data/raw_records/aco/work \
  --sub_directory_filter marcxml_out

# Extract parallel lines
echo "Extracting lines."
make extract_lines

# Clean, preprocess, and split
echo "Preprocessing data set."
make data_set

# Train MLE model
echo "Building MLE simple rules."
"${PYTHON}" src/loc_transcribe.py predict simple dev
# The brace list is expanded by bash into one argument per size.
# NOTE(review): the full size is spelled `1` here but `1.0` in the seq2seq step,
# and the predict step below reads models/mle/size1.0.tsv -- confirm that
# `train mle --size 1` writes that file name.
"${PYTHON}" src/loc_transcribe.py train mle --size {1,0.5,0.25,0.125,0.0625,0.03125,0.015625}
"${PYTHON}" src/loc_transcribe.py predict mle dev \
  --mle_model models/mle/size1.0.tsv \
  --backoff predictions_out/simple/dev/simple.out
# make predict_mle # NOTE this should replace the `train mle` and `predict mle`
# commands above, but there is no such Makefile target.

# Seq2Seq
echo "Preparing Seq2seq."
make prep_seq2seq

echo "Training models."
"${PYTHON}" src/loc_transcribe.py train seq2seq --train --size {1.0,0.5,0.25,0.125,0.0625,0.03125,0.015625}