IRST LM Toolkit Code

A toolkit for language modeling.

Status: Planning
Brought to you by: mfederico, nicolabertoldi
[r860]: / trunk / src / cswam.cpp Maximize Restore History
638 lines (483 with data), 19.1 kB

/******************************************************************************
 IrstLM: IRST Language Model Toolkit, compile LM
 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
 
 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
 
 *******************************************************************************/

#include <sys/mman.h>
#include <stdio.h>
#include <cmath>
#include <limits>
#include <string>
#include <sstream>
#include <pthread.h>
#include "thpool.h"
#include "mfstream.h"
#include "mempool.h"
#include "htable.h"
#include "n_gram.h"
#include "util.h"
#include "dictionary.h"
#include "ngramtable.h"
#include "doc.h"
#include "cswam.h"

#define BUCKET 1000
#define SSEED 1.0

using namespace std;

#define MY_RAND (((float)random()/RAND_MAX)* 2.0 - 1.0)
	
cswam::cswam(char* sdfile,char *tdfile, char* w2vfile,bool usenull, bool normvect,bool scalevect,bool trainvar){
    
    //create dictionaries
    srcdict=new dictionary(NULL,100000); srcdict->generate(sdfile,true);
    trgdict=new dictionary(NULL,100000); trgdict->generate(tdfile,true);
    
    //make aware of oov word
    srcdict->encode(srcdict->OOV());
    trgdict->encode(trgdict->OOV());
    
    //load word2vec dictionary
    W2V=NULL;D=0;
    loadword2vec(w2vfile);

    //check consistency of word2vec with target vocabulary
    
    //actual model structure
    S=NULL;
    M=NULL;
    A=NULL;
    
    
    normalize_vectors=normvect;
    scale_vectors=scalevect;
    train_variances=trainvar;
    use_null_word=usenull;
    
    srandom(100); //ensure repicable generation of random numbers
    bucket=BUCKET;
    threads=1;
}

cswam::~cswam() {
    
    assert(A==NULL);
    
    if (S){
        cerr << "Releasing memory of S\n";
        for (int e=0;e<trgdict->size();e++) delete [] S[e];
        delete S;
    }
    if (M){
        cerr << "Releasing memory of M\n";
        for (int e=0;e<trgdict->size();e++) delete [] M[e];
        delete [] M;
    }
    if (W2V){
        cerr << "Releasing memory of W2W\n";
        for (int f=0;f<srcdict->size();f++)
            if (W2V[f]!=NULL) delete [] W2V[f];
        delete [] W2V;
    }
    
    cerr << "Releasing memory of srcdict\n";
    delete srcdict;
    cerr << "Releasing memory of srcdict\n";
    delete trgdict;
    
    
}



void cswam::loadword2vec(char* fname){
    
    cerr << "Loading word2vec file " << fname << " ...\n";
    mfstream inp(fname,ios::in);
    
    long long w2vsize;
    inp >> w2vsize; cout << w2vsize << "\n";
    inp >> D ; cout << D  << "\n";
    
    W2V=new float* [srcdict->size()];
    for (int f=0;f<srcdict->size();f++) W2V[f]=NULL;
    
    char word[100]; int code; float dummy;
    int BoD=srcdict->encode(srcdict->BoD());
    int EoD=srcdict->encode(srcdict->BoD());
    
    for (long long i=0;i<w2vsize;i++){
        inp >> word;
        code=srcdict->encode(word);
        if (code != srcdict->oovcode()){
            W2V[code]=new float[D];
            for (int d=0;d<D;d++) inp >> W2V[code][d];
        }
        else //skip this word
            for (int d=0;d<D;d++) inp >> dummy;
        
        if (!(i % 100000)) cerr<< word << " ";
    }
    
    for (int f=0;f<srcdict->size();f++){
        if (W2V[f]==NULL && f!=BoD && f!=EoD) {
            cerr << "Missing src word in w2v: " << srcdict->decode(f) << "\n";
            W2V[f]=new float[D];
            for (int d=0;d<D;d++) W2V[f][d]=0;
        }
    }
    
    
    cerr << "\n";
    
    //normalized vector components
    float mean[D]; memset(mean,0,D*sizeof(float));
    float var[D]; memset(var,0,D*sizeof(float));
    
    for (code=0;code<srcdict->size();code++){
        
        if (!W2V[code]){
            cerr << "creating vector for word " << srcdict->decode(code) << "\n";
            W2V[code]=new float[D];
            for (int d=0;d<D;d++) W2V[code][d]=0; //to be worked out!
        }
        
        if (normalize_vectors || scale_vectors)
            for (int d=0;d<D;d++){
                mean[d]+=W2V[code][d];
                var[d]+=(W2V[code][d] * W2V[code][d]);
            }
    }
    
    if (normalize_vectors || scale_vectors){
        
        for (int d=0;d<D;d++){
            mean[d]/=srcdict->size();
            var[d]=var[d]/srcdict->size() - (mean[d]*mean[d]);
            cerr << d << " mean: " << mean[d] << "  sd: " << sqrt(var[d]) << "\n";
        }
        
        if (normalize_vectors){
            cerr << "Shifting vectors\n";
            for (code=0;code<srcdict->size();code++)
                for (int d=0;d<D;d++) W2V[code][d]=(W2V[code][d] - mean[d]);
        }
        if (normalize_vectors || scale_vectors){
            cerr << "Scaling vectors\n";
            for (code=0;code<srcdict->size();code++)
                for (int d=0;d<D;d++) W2V[code][d]=W2V[code][d]/sqrt(var[d]);
            
        }
    }
    cerr << " ... done\n";
};


void cswam::initModel(char* modelfile){
    
    //test if model is readable
    bool testmodel=false;
    FILE* f;if ((f=fopen(modelfile,"r"))!=NULL){fclose(f);testmodel=true;}
    
    if (testmodel) loadModel(modelfile,true); //we are in training mode!
    else{ //initialize model
        M=new float* [trgdict->size()];
        S=new float* [trgdict->size()];
        for (int e=0; e<trgdict->size(); e++){
            M[e]=new float [D];
            S[e]=new float [D];
            //initialize with w2v value if the same word is also in src
            int f=srcdict->encode(trgdict->decode(e));
            if (f!=srcdict->oovcode()){
                memcpy(M[e],W2V[f],sizeof(float) * D);
                for (int d=0;d<D;d++) S[e][d]=SSEED/4;
                cout << "initialize: " << srcdict->decode(f) << "\n";
            }else
                for (int d=0;d<D;d++){
                    M[e][d]=0.0; //pick mean zero
                    S[e][d]=SSEED; //take a wide standard deviation
                }
        }
    }
}

int cswam::saveModelTxt(char* fname){
    cerr << "Writing model into: " << fname << "\n";
    mfstream out(fname,ios::out);
    for (int e=0; e<trgdict->size(); e++){
        out << trgdict->decode(e) <<"\n";
        for (int d=0;d<D;d++) out << M[e][d] << " ";out << "\n";
        for (int d=0;d<D;d++) out << S[e][d] << " ";out << "\n";
    }
    return 1;
}

int cswam::saveModel(char* fname){
    cerr << "Saving model into: " << fname << " ...";
    mfstream out(fname,ios::out);
    out << "CSWAM " << D << "\n";
    trgdict->save(out);
    for (int e=0; e<trgdict->size(); e++){
        out.write((const char*)M[e],sizeof(float) * D);
        out.write((const char*)S[e],sizeof(float) * D);
    }
    out.close();
    cerr << "\n";
    return 1;
}

int cswam::loadModel(char* fname,bool expand){

    cerr << "Loading model from: " << fname << "...";
    mfstream inp(fname,ios::in);
    char header[100];
    inp.getline(header,100);
    cerr << header ;
    int r;
    sscanf(header,"CSWAM %d\n",&r);
    if (D>0 && r != D)
        exit_error(IRSTLM_ERROR_DATA, "incompatible dimension in model");
    else
        D=r;

    cerr << "\nLoading dictionary ... ";
    dictionary* dict=new dictionary(NULL,1000000);
    dict->load(inp);
    dict->encode(dict->OOV());
    int current_size=dict->size();
    
    //expand the model for training or keep the model fixed for testing
    if (expand){
        cerr << "\nExpanding model to include targer dictionary";
        dict->incflag(1);
        for (int code=0;code<trgdict->size();code++)
            dict->encode(trgdict->decode(code));
        dict->incflag(2);
    }
    //replace the trgdict with the model dictionary
    delete trgdict;trgdict=dict;
    
    M=new float* [trgdict->size()];
    S=new float* [trgdict->size()];
    for (int e=0; e<trgdict->size(); e++){
        M[e]=new float [D];
        S[e]=new float [D];
    }
    
    cerr << "\nReading parameters .... ";
    for (int e=0; e<current_size; e++){
        inp.read((char *)M[e],sizeof(float) * D);
        inp.read((char *)S[e],sizeof(float) * D);
    }
    inp.close();
    cerr << "\nInitializing " << trgdict->size()-current_size << " new entries .... ";
    for (int e=current_size; e<trgdict->size(); e++)
        for (int d=0;d<D;d++){
            M[e][d]=0.0;S[e][d]=SSEED;
        }
    
    cerr << "\nDone\n";
    return 1;
}

void cswam::initAlpha(){

    //install Alpha[s][i][j] to collect counts
    //allocate if empty
    
    if (A==NULL){
        assert(trgdata->numdoc()==srcdata->numdoc());
        A=new float**[trgdata->numdoc()];
        for (int s=0;s<trgdata->numdoc();s++){
            A[s]=new float *[trgdata->doclen(s)];
             for (int i=0;i<trgdata->doclen(s);i++)
                A[s][i]=new float [srcdata->doclen(s)];
        }
    }
    //initialize
    for (int s=0;s<trgdata->numdoc();s++){
        for (int i=0;i<trgdata->doclen(s);i++)
            memset(A[s][i],0,sizeof(float) * srcdata->doclen(s));
    }
}

void cswam::freeAlpha(){

    if (A!=NULL){
        for (int s=0;s<trgdata->numdoc();s++){
            for (int i=0;i<trgdata->doclen(s);i++)
                delete [] A[s][i];
            delete [] A[s];
        }
        delete [] A;
        A=NULL;
    }
}

///*****
pthread_mutex_t mut1;
pthread_mutex_t mut2;
double LL=0; //Log likelihood


float logsum(float a,float b){
    if (b<a) return a + log(1 + exp(b-a));
    else return b + log(1+ exp(a-b));
}


float cswam::LogGauss(const int dim,const float* x,const float *m, const float *s){
    
    static float log2pi=1.83787; //log(2 pi)
    float dist=0; float norm=0;

    for (int i=0;i<dim;i++){
        dist=(x[i]-m[i])*(x[i]-m[i])/(s[i]);
        norm+=s[i];
    }
    
    return -0.5 * (dist + dim * log2pi + logf(norm));
    
}
            
void cswam::expected_counts(void *argv){
    
    long long s=(long long) argv;
    
    if (! (s % 10000)) {cerr << ".";cerr.flush();}
    //fprintf(stderr,"Thread: %lu  sentence: %d  (out of %d)\n",(long)pthread_self(),s,srcdata->numdoc());
    
    int trglen=trgdata->doclen(s); // length of target sentence
    int srclen=srcdata->doclen(s); //length of source sentence
    
    float den;

    //reset likelihood
    localLL[s]=0;
    
    //compute denominator for each source-target pair
    for (int j=0;j<srclen;j++){
        //cout << "j: " << srcdict->decode(srcdata->docword(s,j)) << "\n";

          for (int i=0;i<trglen;i++){
            A[s][i][j]=LogGauss(D, W2V[srcdata->docword(s,j)],
                                M[trgdata->docword(s,i)],
                                S[trgdata->docword(s,i)]);
            
              if (i==0) den=A[s][i][j];
              else den=logsum(den,A[s][i][j]);
          }
        
        //update local likelihood
        localLL[s]+=den;
        
        for (int i=0;i<trglen;i++){
            assert(A[s][i][j]<= den);
            A[s][i][j]=expf(A[s][i][j]-den); // A is now a regular expected count
            
            if (A[s][i][j]<0.000000001) A[s][i][j]=0; //take mall risk of wrong normalization
            
            //            if (trgdata->docword(s,i)==trgdict->encode("documentos"))
            
//            cout << "Pr(" << trgdict->decode(trgdata->docword(s,i))
//            << " | "  << srcdict->decode(srcdata->docword(s,j))
//            << ") = " << A[s][i][j] << "\n";
        }
    }
}

void cswam::maximization(void *argv){
    
    long long d=(long long) argv;
    
    if (!(d  % 10)) cerr <<".";
    //Maximization step: Mean;
    for (int s=0;s<srcdata->numdoc();s++)
        for (int i=0;i<trgdata->doclen(s);i++)
            for (int j=0;j<srcdata->doclen(s);j++)
                if (A[s][i][j]>0)
                    M[trgdata->docword(s,i)][d]+=A[s][i][j] * W2V[srcdata->docword(s,j)][d];
    
    //second pass
    for (int e=0;e<trgdict->size();e++) if (Den[e]>0) M[e][d]/=Den[e];
    
    if (train_variances){
        //Maximization step: Variance;
        
        for (int s=0;s<srcdata->numdoc();s++)
            for (int i=0;i<trgdata->doclen(s);i++)
                for (int j=0;j<srcdata->doclen(s);j++)
                    if (A[s][i][j]>0)
                        S[trgdata->docword(s,i)][d]+=
                        (A[s][i][j] *
                         (W2V[srcdata->docword(s,j)][d]-M[trgdata->docword(s,i)][d]) *
                         (W2V[srcdata->docword(s,j)][d]-M[trgdata->docword(s,i)][d])
                         );
        
        //second pass
        for (int e=0;e<trgdict->size();e++){
            if (Den[e]>0){
                S[e][d]/=Den[e];
                if (S[e][d] < 0.01) S[e][d]=0.01;
            }
            else
                if (d==0) cout << "-\b";
        }
    }
}


int cswam::train(char *srctrainfile, char*trgtrainfile,char *modelfile, int maxiter,int threads){
    
   
    initModel(modelfile);

    //Load training data

    srcdata=new doc(srcdict,srctrainfile);
    trgdata=new doc(trgdict,trgtrainfile,use_null_word); //use null word

   
    int iter=0;
    
    cerr << "Starting training";
    threadpool thpool=thpool_init(threads);
    task *t=new task[srcdata->numdoc()];
    assert(srcdata->numdoc()>D); //multi-threading also distributed over D
    
   
    //support variable to compute model denominator
    Den=new float[trgdict->size()];
    //support variable to compute likelihood
    localLL=new float[srcdata->numdoc()];
    float LL;
    
    while (iter < maxiter){
        
        cerr << "\nIteration: " << ++iter <<  "\n";
        
        initAlpha();
        
        cerr << "E-step: ";
        //compute expected counts in each single sentence
        for (long long  s=0;s<srcdata->numdoc();s++){
            //prepare and assign tasks to threads
            t[s].ctx=this; t[s].argv=(void *)s;
            thpool_add_work(thpool, &cswam::expected_counts_helper, (void *)&t[s]);
            
        }
        //join all threads
        thpool_wait(thpool);
        
        
        //Prepare for model for update
        for (int e=0;e <trgdict->size();e++){
            memset(M[e],0,D * sizeof (float));
            if (train_variances)
                memset(S[e],0,D * sizeof (float)); //keep variance constant
        }
        
        memset(Den,0,trgdict->size() * sizeof(float));
        
        LL=0; //compute LL of current model
        //compute normalization term for each target word
        for (int s=0;s<srcdata->numdoc();s++){
            LL+=localLL[s];
            for (int i=0;i<trgdata->doclen(s);i++)
                for (int j=0;j<srcdata->doclen(s);j++)
                    Den[trgdata->docword(s,i)]+=A[s][i][j];
        }
        cerr << "LL = " << LL;
        cerr << "\nM-step: ";
        for (long long d=0;d<D;d++){
            t[d].ctx=this; t[d].argv=(void *)d;
            thpool_add_work(thpool, &cswam::maximization_helper, (void *)&t[d]);
        }
        
        //join all threads
        thpool_wait(thpool);
        
        
        
        if (srcdata->numdoc()> 10) system("date");
        
        saveModel(modelfile);
        saveModelTxt("modelfile.txt");
    }
    
   // for (int e=0;e<trgdict->size();e++)
   //     for (int d=0;d<D;d++)
   //         cout << trgdict->decode(e) << " S: " << S[e][d] << " M: " << M[e][d]<< "\n";

    //destroy thread pool
    thpool_destroy(thpool);
  
    freeAlpha();

    delete []Den;
    delete srcdata; delete trgdata;
    delete [] t; delete [] localLL;
    
    return 1;
}



void cswam::aligner(void *argv){
    long long s=(long long) argv;
    static float maxfloat=std::numeric_limits<float>::max();

    
    if (! (s % 10000)) {cerr << ".";cerr.flush();}
    //fprintf(stderr,"Thread: %lu  Document: %d  (out of %d)\n",(long)pthread_self(),s,srcdata->numdoc());
    
    int trglen=trgdata->doclen(s); // length of target sentence
    int srclen=srcdata->doclen(s); //length of source sentence
    
    assert(trglen<MAX_LINE);
    
    //Viterbi alignment: find the most probable alignment for source
    float score; float best_score;int best_j;
    for (int i=0;i<trglen;i++){
        best_score=-maxfloat;best_j=0;
        //cout << trgdict->decode(trgdata->docword(s,i)) << "\n";
        for (int j=0;j<srclen;j++){
            score=LogGauss(D,
                     W2V[srcdata->docword(s,j)],
                     M[trgdata->docword(s,i)],
                     S[trgdata->docword(s,i)]);
          //  cout << "\t " << srcdict->decode(srcdata->docword(s,j)) << "  " << dist << "\n";
            //if (dist > -50) score=(float)exp(-dist)/norm;
            if (score > best_score){
                best_score=score;
                best_j=j;
            }
        }
        alignments[s % bucket][i]=best_j;
    }
}




int cswam::test(char *srctestfile, char *trgtestfile, char* modelfile, char* alignfile,int threads){
    
    {mfstream out(alignfile,ios::out);} //empty the file
    
    initModel(modelfile);
    
    //Load training data
    srcdata=new doc(srcdict,srctestfile);
    trgdata=new doc(trgdict,trgtestfile);
    assert(srcdata->numdoc()==trgdata->numdoc());
    
   
    bucket=BUCKET; //initialize the bucket size
    
    alignments=new int* [BUCKET];
    for (int s=0;s<BUCKET;s++)
        alignments[s]=new int[MAX_LINE];
    
    threadpool thpool=thpool_init(threads);
    task *t=new task[bucket];
    
    cerr << "Start alignment\n";
    
    for (long long s=0;s<srcdata->numdoc();s++){
        
        t[s % bucket].ctx=this; t[s % bucket].argv=(void *)s;
        thpool_add_work(thpool, &cswam::aligner_helper, (void *)&t[s % bucket]);
        

        if (((s % bucket) == (bucket-1)) || (s==(srcdata->numdoc()-1)) ){
            //join all threads
            thpool_wait(thpool);
            
            //cerr << "Start printing\n";
            
            if ((s % bucket) != (bucket-1))
                    bucket=srcdata->numdoc() % bucket; //last bucket at end of file
            
                mfstream out(alignfile,ios::out | ios::app);
                
                for (int b=0;b<bucket;b++){ //includes the eof case of
                    out << "Sentence: " << s-bucket+1+b;
                    for (int i=0; i<trgdata->doclen(s-bucket+1+b); i++) out << " "  << i << "-" << alignments[b][i];
                    out << "\n";
                }
        }
        
    }
    
    
    //destroy thread pool
    thpool_destroy(thpool);
    
    delete [] t;
    for (int s=0;s<BUCKET;s++) delete [] alignments[s];delete [] alignments;
    delete srcdata; delete trgdata;
    return 1;
}