#!/bin/bash
# Usage: get_moses_scores.sh <phrase-table.gz> <output.vocabext> <output.arpa>
#
# Reads a gzip-compressed phrase table whose fields are separated by " ||| "
# and writes two files:
#   <output.vocabext>  one line per phrase pair: running index, joined
#                      "source A target" token, the remaining score columns
#                      and a length-based value.
#   <output.arpa>      a unigram-only ARPA file whose "words" are the running
#                      indices and whose log10 weights come from the first
#                      score column, plus </s>, <s> and <unk> entries.
#
# NOTE(review): field 5 of the phrase table is presumably the score list for
# the table layout this script was written for -- confirm against the
# phrase-table format actually in use.

set -euo pipefail

if (( $# < 3 )); then
  printf 'Usage: %s <phrase-table.gz> <output.vocabext> <output.arpa>\n' \
    "${0##*/}" >&2
  exit 2
fi

# Private scratch file instead of a fixed, shared name in /tmp; removed on any
# exit path so concurrent runs cannot clobber each other.
scratch=$(mktemp "${TMPDIR:-/tmp}/get_moses_scores.XXXXXX")
trap 'rm -f -- "$scratch"' EXIT

# Step 1: flatten the phrase table.
# "|||" is rewritten to "_#_" so awk can split on " _#_ ". Inside the source
# side spaces become "B", inside the target side spaces become "A", and the
# two sides are glued together with an "A". n is the number of spaces that
# were replaced in the target side, so the last column is exp(n + 1).
zcat -- "$1" \
  | sed "s/|||/_#_/g" \
  | awk '
      BEGIN { FS = " _#_ " }
      {
        gsub(" ", "B", $1)
        n = gsub(" ", "A", $2)
        printf("%d %sA%s %s %f\n", NR, $1, $2, $5, exp(n + 1))
      }' > "$scratch"

# The index on the last line equals the number of phrase pairs (empty when the
# table has no lines, which awk below treats as 0).
unigrams=$(tail -n 1 -- "$scratch" | awk '{print $1}')

# Step 2: vocabulary extension file. Column 3 (the first score) is left out
# here; it is the value used for the ARPA weights below.
awk '{print $1,$2,$4,$5,$6,$7,$8}' "$scratch" > "$2"

# Step 3: unigram ARPA file. The count is the number of phrase pairs plus the
# three special entries </s>, <s> and <unk> emitted in the END block.
awk -v t="$unigrams" '
  BEGIN {
    print ""
    print "\\data\\"
    printf("ngram 1=%d\n", t + 3)
    print ""
    print "\\1-grams:"
  }
  {
    # log10 of the first score column, keyed by the line index.
    printf("%f\t%d\n", log($3) / log(10), NR)
  }
  END {
    printf("0.000000\t</s>\n")
    printf("-99\t<s>\n")
    printf("0.000000\t<unk>\n")
    print ""
    print "\\end\\"
  }' "$scratch" > "$3"