#include <sys/stat.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <math.h>
#include "fsm.h"
#include "refx.h"
#include "l2w.h"
#include "parse.h"
ea **reserva_memoria_trellis(int size) {
ea **lea;
memtest(lea = (ea **) malloc(2*sizeof(ea *)));
memtest(lea[0] = (ea *) malloc(size*sizeof(ea)));
memtest(lea[1] = (ea *) malloc(size*sizeof(ea)));
return lea;
}
void anyade_estados_iniciales(T_REDGEN *A, ea **lea, int *contea, T_REDGEN *B, T_REDGEN *WG) {
lea[0][0].estado = A->ini;
lea[0][0].score = 0;
lea[0][0].input = 0;
//lea[0][0].productividad = 0;
lea[0][0].output = NULL;
if (B != NULL) lea[0][0].lm_estado = B->ini;
else lea[0][0].lm_estado = 0;
(*contea)++;
lea[0][0].WG = 0;
lea[0][0].WG_from = -1;
if (WG != NULL) enter_state(WG, 0, 0, INT_MAX);
}
void escribe_lea(ea **lea, int which, int num, char **dictionary) {
int i;
char candidate[MAXLINE];
l_int *aux;
fprintf(stderr, "\nLista de Estados Activos (%d)\n", num);
for (i = 0; i < num; i++) {
fprintf(stderr, "%d + (%6d lm) (Score = %-8.6g) ::: Input = %2d ::: WG = %3d -> %3d ::: Output = ",
lea[which][i].estado, lea[which][i].lm_estado, lea[which][i].score, lea[which][i].input, lea[which][i].WG_from, lea[which][i].WG);
aux = lea[which][i].output;
while (aux != NULL) {
fprintf(stderr, "%d ", aux->n);
aux = aux->next;
}
fprintf(stderr, "\n");
//get_output(vocab, dictionary, lea[which][i].output, candidate);
//printf("%s\n", candidate);
//write_output(dictionary, lea[which][i].output, stderr);
}
fprintf(stderr, "\n");
fflush(stderr);
}
int inserta_ea(T_REDGEN *A, ea **lea, int start, int *contea, l_int *previous, T_ARISTA *a, double score, int *UNK, int n,
char *simb, int type, struct options *opt, char **tags, char **dict, h_t *table, h_t *vocab, int lm, T_REDGEN *WG, ea *ea_from, int ea_k, double w, int *contWG) {
int j, k, l;
int found = 0;
int flagUNK;
l_int input, *tmp;
ea aux;
int backup;
char candidate[MAXLINE];
// Busqueda del mismo estado y que tenga un mejor score
for (j = start; j < *contea && lea[1][j].score <= score; j++) {
//if ((lea[j].estado == a->dest) && (lea[j].input == n)) { /* PELIGRO :: REVISAR ESTA LINEA */
if ((lea[1][j].estado == a->dest) && (lea[1][j].lm_estado == lm) && (lea[1][j].input == n)) { /* PELIGRO :: REVISAR ESTA LINEA */
found = 1; /* QUIZA :: && BO >= lea[j].BO */
break; /* BORRAR pos J, INSERTAR en nueva POS */
}
}
//fprintf(stderr, "Le tocaría situarse en la posicion %d\n", j); fflush(stderr);
if (!found) {
// Busqueda del mismo estado y que tenga un peor score
// Si no lo encuentra, k se queda apuntando a la siguiente posición libre del vector
for (k = j; k < *contea; k++)
if ((lea[1][k].estado == a->dest) && (lea[1][k].lm_estado == lm) && (lea[1][k].input == n)) {
found = 1;
break;
}
if (found) {
backup = lea[1][k].WG;
//lea[0][lea[1][k].WG_from].productividad--;
free_output(lea[1][k].output);
}
// Desplazamiento: si k se sale del vector, el rango [j..k-2] => [j+1..k-1]
//fprintf(stderr, "Hay que correr desde %d..%d\n", j, k); fflush(stderr);
for (l = k; l > j; l--) {
if (l < opt->beam)
lea[1][l] = lea[1][l - 1];
else free_output(lea[1][l-1].output);
//printf("Escribiendo %d sobre %d: \"", l-1, l);
//write_output(dictionary, lea[l].output, stdout);
}
if (j < opt->beam) {
lea[1][j].estado = a->dest;
lea[1][j].score = score;
lea[1][j].input = n;
lea[1][j].lm_estado = lm;
//lea[1][j].productividad = 0;
lea[1][j].WG_from = ea_from->WG;
lea[1][j].arista = a;
lea[1][j].transicion = w;
//ea_from->productividad++;
if ((opt->reord) && (type != Backoff)) {
/* Procesar a->output = lema numero lema numero lema numero ...
*
* Coger cada lema, encontrar su tags[numero]
* consultar la tabla y emitir la salida */
//fprintf(stderr, "Palabra %s\n", simb);
//write_output(dict, a->output, stdout);
tmp = read_entry(a->output, dict, tags, n, table, vocab);
lea[1][j].output = append(previous, tmp, UNK, simb, OUT);
//printf("Simbolo %d: %s %s\n", *simb, words[*simb], tags[*simb]);
/* input.n = a->input;
input.next = NULL;
tmp = append(previous, &input, UNK, simb, IN);
lea[j].output = append(tmp, a->output, UNK, simb, OUT); */
free_output(tmp);
}
else lea[1][j].output = append(previous, a->output, UNK, simb, OUT);
if (found) // HAY UNA HIPOTESIS PEOR QUE LA ACTUAL
lea[1][j].WG = backup;
else if (*contea < opt->beam) { // ES UNA HIPOTESIS NUEVA Y CABE EN EL BEAM
lea[1][j].WG = *contWG;
(*contWG)++;
(*contea)++;
}
else { // EL BEAM ESTA LLENO Y LA HIPOTESIS NUEVA DESPLAZA A LA ULTIMA
// DE LA CUAL HAY QUE ELIMINAR TODOS LOS ARCOS QUE LLEGABAN A ELLA
lea[1][j].WG = lea[1][k-1].WG;
//lea[0][lea[1][k-1].origen].productividad--;
}
}
}
return j;
}
int actualiza_trellis(T_REDGEN *A, T_REDGEN *WG, ea **lea, int source, int cont, int target, int contea, int etapa) {
int i, j, contador = 0;
l_int *p, *pp;
char candidate[MAXLINE];
for (j = 0; j < cont; j++) {
free_output(lea[source][j].output);
lea[source][j].output = NULL;
}
for (i = j = 0; j < contea; j++)
if (lea[target][j].input == etapa) {
lea[source][i] = lea[target][j];
i++;
if (WG != NULL) {
strcpy(candidate, " ");
get_output(A->alfa_in, A->alfa_out, lea[target][j].arista->output, candidate);
//fprintf(stderr, "Insertando en WG... %d -> %d %g\n", lea[target][j].WG_from, lea[target][j].WG, lea[target][j].transicion);
enter_transition(WG, NULL, 0, Training, Transit, lea[target][j].WG_from, lea[target][j].WG, candidate, lea[target][j].transicion, NULL);
}
}
else {
lea[target][contador] = lea[target][j];
contador++;
}
return contador;
}
void free_trellis(ea **lea) {
free(lea[0]);
free(lea[1]);
//free(lea[2]);
free(lea);
}
l_int *addlista(int state, l_int *lista) {
l_int *new, *aux;
new = (l_int *) malloc(sizeof(l_int));
new->n = state;
//new->word = NULL;
new->next = lista;
return new;
}
void freelista(T_REDGEN *A, l_int *lista) {
l_int *aux, *before;
aux = lista;
while (aux != NULL) {
A->est[aux->n].active = 1;
before = aux;
aux = aux->next;
free(before);
}
}
void printlista(l_int *lista) {
l_int *aux;
fprintf(stderr, "------------\n");
aux = lista;
while (aux != NULL) {
fprintf(stderr, "ESTADO %d\n", aux->n);
aux = aux->next;
}
fprintf(stderr, "------------\n");
}
double newscore(double oldscore, T_ARISTA *a, struct options *opt,
T_REDGEN *A, T_REDGEN *B, int *lm_estado, char *simb, int *UNK) {
double score = 0;
int i;
//fprintf(stderr, "LM state = %d\n", *lm_estado);
for (i = 0; i < opt->ngram && i < a->np; i++)
if ((opt->l[i] != 0) && (a->prob[i] == INT_MAX)) return -1;
else score = score + opt->l[i]*a->prob[i];
//fprintf(stderr, "TR score = %f\n", score);
if (B != NULL)
if (a->output != NULL) score = score + opt->norm*miniparselm(a->output, A, B, lm_estado, simb, UNK);
else score = score + opt->norm*B->est[*lm_estado].final;
//fprintf(stderr, "TR+LM score = %f\n", score);
score = oldscore + score;
return score;
}
h_t *explore(T_REDGEN *A, int state, int **index, int i, int n, double oldscore, l_int *lista, int save,
ea **lea, int *contea, l_int *previous, int *UNK, char *simb, struct options *opt,
char **tags, h_t *table, h_t *vocab, int lm_estado, T_REDGEN *B, T_REDGEN *WG, int *contWG, int k) {
char key[MAXLINE], key2[MAXLINE], key3[MAXLINE];
T_ARISTA *a, *aa;
int j = i, z, mini = state;
double score, *pt;
h_t *aux = NULL, *below;
he_t *p,*pp;
int lm = lm_estado;
int found = 0;
//fprintf(stderr, "Estado %d/%d...\n", state, lm_estado);
do {
if (index[j] != (int *) -1) {
sprintf(key, "%d %d", mini, *(index[j]));
a = hs(key, strlen(key)+1, A->table);
if (a != (T_ARISTA *) -1) {
//fprintf(stderr, "Arista compatible: %d => %d (%g)\n",
// a->origen, a->dest, A->est[a->dest].final);
for (aa = NULL; a != NULL; a = a->siguiente) {
lm = lm_estado;
if (A->est[a->dest].final < INT_MAX) {
score = newscore(oldscore, a, opt, A, B, &lm, simb, UNK);
if (score >= 0) {
found = 1;
if (A->est[a->dest].active) {
if ((opt->algorithm) && (!((state == 0) && (a->dest == 0)))) {
z = a->dest;
while (z != 0) {
lista = addlista(z, lista);
A->est[z].active = 0;
z = A->est[z].BACKOFF->dest;
}
}
//fprintf(stderr, "--->Encontrada: %d => %d (WG = %d)\n",
// a->origen, a->dest, *contWG); fflush(stderr);
inserta_ea(A, lea, 0, contea, previous, a, score, UNK, j, simb,
Transit, opt, tags, A->alfa_out, table, vocab, lm,
WG, &lea[0][k], k, score-oldscore, contWG);
//fprintf(stderr, "contea = %d\n", *contea); fflush(stderr);
//(*contWG)++;
//enter_transition(WG, Training, Transit, lea[0][k].WG, contWG, "hola hello", score-oldscore, NULL);
}
}
}
else aa = a;
/* if (((*contea == 0) && (*contaux == 0)) ||
((*contea == 0) && (score < 3.5*lea[2][0].score)) ||
((*contea > 0) && (score < 3.5*lea[1][0].score))) { // Factor de DOUBLE BEAM */
//if (score >= 0) {
}
j++;
if (aa != NULL) mini = aa->dest;
}
}
} while ((a != (T_ARISTA *) -1) && (aa != NULL) && (j < n) && (index[j] != (int *) -1));
//printlista(lista);
if (opt->algorithm || !found) {
if ((a = A->est[state].BACKOFF) != NULL) {
//fprintf(stderr, "Explorando BACKOFF\n");
score = newscore(oldscore, a, opt, NULL, NULL, &lm_estado, NULL, UNK);
below = explore(A, a->dest, index, i, n, score, lista, 0,
lea, contea, previous, UNK, simb, opt, tags, table, vocab, lm_estado, B, WG, contWG, k);
}
else {
if (((a = A->est[state].UNKNOWN) != NULL) && !found) {
//fprintf(stderr, "Palabra desconocida en el TR\n"); fflush(stderr);
score = newscore(oldscore, a, opt, NULL, B, &lm, simb, UNK);
inserta_ea(A, lea, 0, contea, previous, a, score, UNK, i,
simb, Unknown, opt, tags, A->alfa_out, table, vocab, lm,
WG, &lea[0][k], k, a->prob[0], contWG);
}
freelista(A, lista);
}
}
return aux;
}
l_int *parse(T_REDGEN *A, int *UNK, char **source, char *corpus, int n, ea **lea, struct options *opt, char *eagles, h_t *table, h_t *vocab, T_REDGEN *B, int contador) {
char simb[MAXLINE], key[MAXLINE], key2[MAXLINE], key3[MAXLINE], *next, *next2, *next3;
char *tags[MAXLINE];
char wordgraph[MAXLINE];
int **index;
int k, cont, contea, contWG = 1;
int e, i, j, l, kept;
T_ARISTA *a, *aa;
double score, *pt;
//int found;
l_int *output;
he_t *p;
T_REDGEN *WG = NULL;
memtest(index = (int **) malloc(n*sizeof(int *)));
for (i = 1; i < n; i++) index[i] = hs(source[i], strlen(source[i])+1, A->table_in);
cont = contea = 0;
if (opt->wordgraphs) WG = CrearRed(opt->beam*(n-1)+2, opt->hts);
anyade_estados_iniciales(A, lea, &cont, B, WG);
next2 = eagles; next3 = corpus;
for (i = 1; i < n; i++) {
escribe_lea(lea, 0, cont, A->alfa_out);
strcpy(simb, source[i]);
fprintf(stderr, "Parseando símbolo %d/%d (%s)...\n", i, n-1, source[i]); fflush(stderr);
if (opt->reord == 1) {
sscanf( (const char *) next2, "%s", simb);
tags[i] = strdup(simb);
sscanf( (const char *) next3, "%s", simb);
}
for (k = 0; k < cont; k++) {
//fprintf(stderr, "Estado %d/%d...\n", lea[0][k].estado, lea[0][k].lm_estado);
e = lea[0][k].estado;
explore(A, e, index, i, n, lea[0][k].score, NULL, 1,
lea, &contea, lea[0][k].output, UNK, simb, opt, tags, table, vocab,
lea[0][k].lm_estado, B, WG, &contWG, k);
}
kept = actualiza_trellis(A, WG, lea, 0, cont, 1, contea, i);
//escribe_lea(lea, 0, cont, A->alfa_out);
cont = contea - kept;
contea = kept;
//escribe_lea(lea, 0, cont, A->alfa_out);
escribe_lea(lea, 1, contea, A->alfa_out);
/* get_vocabulary(WG, IN);
get_vocabulary(WG, OUT);
write_fsm(WG, "/tmp/kk.wordgraph", opt->ngram, opt);
free_fsm(WG);
exit(1); */
/***********************************************************************************
* Para toda (word/*kkindex)
* Para todo estado activo € LEA.OLD
* Para toda arista compatible (Estado + Simbolo)
* score = score(Estado) + peso(Arista) (en -log10)
* HAY QUE MINIMIZAR LOS SCORES
* if (score < SCORE(LEA.NEW, Arista->Destino)
* Si no hubiera ninguna arista compatible AND existe arista <unk>
* score = score(Estado) + peso(Arista) (en -log10)
* HAY QUE MINIMIZAR LOS SCORES
* if (score < SCORE(LEA.NEW, Arista->Destino)
* Si no hubiera ninguna arista compatible AND existe arista BO
* se añade en esta misma etapa (LEA.OLD)
* el estado destino de la transicion de BO
* sumandole el peso de la arista
* HISTORIC.append(LEA.OLD)
* LEA.OLD <= los 50(beam) mejores de LEA.NEW (los más pequeños)
***********************************************************************************/
/* next = strchr(next, ' ');
if (next != NULL) next++; */
if (opt->reord) {
next2 = strchr(next2, ' ');
if (next2 != NULL) next2++;
next3 = strchr(next3, ' ');
if (next3 != NULL) next3++;
}
}
escribe_lea(lea, 0, cont, A->alfa_out);
memtest(aa = (T_ARISTA *) malloc(sizeof(T_ARISTA)));
aa->output = NULL;
aa->np = 1;
for (k = 0; k < cont; k++) {
e = lea[0][k].estado;
aa->prob[0] = A->est[e].final;
//score = lea[0][k].score + A->est[e].final;
score = newscore(lea[0][k].score, aa, opt, NULL, B, &lea[0][k].lm_estado, NULL, UNK);
aa->dest = e;
inserta_ea(A, lea, 0, &contea, lea[0][k].output, aa, score, UNK, lea[0][k].input,
NULL, Backoff, opt, tags, A->alfa_out, table, vocab, lea[0][k].lm_estado, WG, &lea[0][k], k, score-lea[0][k].score, &contWG);
contWG--;
}
//escribe_lea(lea, 1, contea, A->alfa_out);
/************************* PELIGRO *************************/
if (cont > 0) output = append(lea[1][0].output, aa->output, UNK, NULL, OUT);
else output = NULL;
/************************* PELIGRO *************************/
free(aa);
actualiza_trellis(A, WG, lea, 0, cont, 1, contea, n-1);
cont = contea;
contea = 0;
fprintf(stderr, "Sumando la probabilidad de estado final...\n");
escribe_lea(lea, 0, cont, A->alfa_out);
if (opt->wordgraphs) {
enter_state(WG, contWG, INT_MAX, 0);
WG->ne = contWG+1;
}
/* Para liberar el output final en lea[0] */
actualiza_trellis(A, WG, lea, 0, cont, 1, contea, n-1);
if (opt->reord) for (k = 1; k < n; k++) free(tags[k]);
free(index);
if (opt->wordgraphs) {
get_vocabulary(WG, IN);
get_vocabulary(WG, OUT);
sprintf(wordgraph, "%s.WGs", opt->test);
mkdir(wordgraph, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
sprintf(wordgraph, "%s.WGs/sentence%d.v2", opt->test, contador);
write_fsm(WG, wordgraph, opt->ngram, opt);
free_fsm(WG);
}
return output;
}