[go: up one dir, main page]

Menu

[653d84]: / src / main.cpp  Maximize  Restore  History

Download this file

664 lines (625 with data), 22.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
////////////////////////////////////////////////////////////////////////////////
// Daniel Smith
// dsmith@ics.mq.edu.au
////////////////////////////////////////////////////////////////////////////////
// Program to test Named Entity Recognition
// For training and testing.
// Run with parameters:
// e.g. afner --train --output-model-file modelL235.mdl -P ./RemediaAnnot/level2/ -P ./RemediaAnnot/level3/ -P ./RemediaAnnot/level5/ -D d:/afner/resultL2345.dmp
// or afner --run -M data/model-RC-L235.mdl -P RemediaPlain/level4 -O results/
////////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2008 Diego Molla-Aliod <diego@ics.mq.edu.au>
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
////////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
#include <map>
#include <dirent.h>
#include <boost/regex.hpp>
#include <boost/program_options.hpp>
#include <boost/filesystem/path.hpp>
#include <boost/filesystem/operations.hpp>
#include "tokeniser.h"
#include "ner.h"
#include "entity_tag.h"
#include "regex_handler.h"
#include "list_handler.h"
using namespace std;
using namespace AF;
using namespace boost::program_options;
namespace fs=boost::filesystem;
// Forward declarations of the helpers defined after main().

// Returns the text of the file `fname`; prints a message and returns an empty
// string if the file cannot be opened.
StringXML readfile(const StringXML& fname);
// Appends the names of the entries of directory `path` to `flisting`
// (non-directories) and `dirlisting` (sub-directories).
void list_directory(const StringXML& path, vector<StringXML>& flisting,
vector<StringXML>& dirlisting);
// Runs the recogniser over every file in `files` and every directory in
// `dirs`, writing results under `resultsDir`.
void test(const vector<StringXML>& files,
const vector<StringXML>& dirs,NEDeco deco,
const StringXML& format,const StringXML resultsDir, float threshold);
// Runs the recogniser over the files of `path`, then recurses into its
// sub-directories (mirroring them below `resultsDir`).
void testDirectory(const StringXML& path,NEDeco deco,const StringXML format,
const StringXML resultsDir,float threshold);
// Runs the recogniser over one file and writes the entities to a file in
// `resultsDir`; `format` "SHORT" selects printTRECEnts, anything else
// selects printEnts.
void testFile(const StringXML& path,const StringXML& fname,
NEDeco deco,const StringXML format,
const StringXML resultsDir, float threshold);
// Writes training data for all `files` and `dirs` to `outputFile` and, when
// frequency file names are given, dumps the token frequency tables to them.
void train(const StringXML& outputFile, const vector<StringXML>& files,
const vector<StringXML>& dirs,NEDeco deco,const int classCount,
const StringXML& freqFile,const StringXML& prevFreqFile);
// Writes training data for every file directly inside `path` to `out`.
void trainDirectory(const StringXML& path,ostream& out,
NEDeco deco,vector<map<StringXML,int> >& frequencies,
vector<map<StringXML,int> >& prevFrequencies,
const bool countFrequencies);
// Writes training data for the single file `path` to `out`.
void trainFile(const StringXML& path,ostream& out,
NEDeco deco, vector<map<StringXML,int> >& frequencies,
vector<map<StringXML,int> >& prevFrequencies,
const bool countFrequencies);
// Parses `setting` as "<feature> <weight>" and applies it to `fh`.
void setFeatureWeight(const StringXML& setting, FeatureHandler& fh);
// Reads the list specification `file` and returns a map from each list line
// to the EntityTag built from the line that follows it.
map<StringXML,EntityTag>
InitializeListHandler(const StringXML& file);
// A helper function to simplify the main part: streams every element of `v`
// to `os`, each followed by a single space, and returns `os` so that calls
// can be chained.
// The elements are written to the stream that was passed in (previously they
// were always sent to cout, whatever `os` was).
template<class T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
  std::copy(v.begin(), v.end(), std::ostream_iterator<T>(os, " "));
  return os;
}
// Program entry point.
// Parses the command line (and the optional configuration file named by
// --config-file), decides which single mode to run in (count, train,
// run/test or dump; run is the default), builds the tagset, regex, list and
// feature handlers plus the NEDeco decorator, and dispatches to train() or
// test().
// Returns 1 when help is printed, when parameters are missing or
// conflicting, when the tagset file cannot be opened, or when an exception
// is caught; returns EXIT_SUCCESS otherwise.
int main(int argc, char* argv[]) {
  vector<StringXML> files;
  vector<StringXML> dirs;
  vector<StringXML> feature_weight;
  StringXML mode = "";
  StringXML listfiles = "";
  StringXML trainDataFile="";
  StringXML modelFile="";
  StringXML regex_loc="";
  StringXML feature_regex_file="";
  StringXML outputLocation="";
  StringXML freq="";
  StringXML prevFreq="";
  StringXML freqIn="";
  StringXML prevFreqIn="";
  int context=0;
  StringXML format="NORMAL";
  int maxLabels=1;
  StringXML configFile="";
  bool dotest=false;
  bool dotrain=false;
  bool docount=false;
  bool dodump=false;
  bool singleLabels = false;
  bool insufParam = false;
  StringXML tagsetLoc = "";
  // Initialised to the same values the option defaults below assign.
  float threshold = 0.0f;
  float default_weight = 1.0f;
  try {
    // options only for command line
    options_description command("Command line only arguments");
    command.add_options()
      ("help,h","produce this help message")
      ("config-file,c",value<StringXML>(&configFile),
       "file containing configuration settings")
      ;
    // options for counting tokens
    options_description counting("Settings for counting tokens");
    counting.add_options()
      ("count", "Set mode for counting")
      ("token-frequency-output",value<StringXML>(&freq),
       "file to dump token frequencies to")
      ("prev-token-frequency-output",value<StringXML>(&prevFreq),
       "file to dump previous token frequencies to")
      ;
    // options for training
    options_description training("Training settings");
    training.add_options()
      ("train", "Set mode for training")
      ("output-model-file",value<StringXML>(&modelFile),
       "model file to generate")
      ;
    // options for running
    // (the split help strings end in a space so the concatenated literals
    // do not run words together)
    options_description running("Running settings");
    running.add_options()
      ("run", "Set mode for running (testing)")
      ("model-file,M",value<StringXML>(&modelFile)->default_value("config/BBN.mdl"),
       "model file to read")
      ("format,F",value<StringXML>(&format),
       "specify the format of the "
       "output as either NORMAL or SHORT")
      ("max-labels,L",value<int>(&maxLabels)->default_value(3),
       "maximum labels to assign to a "
       "token")
      ("single,S","allow only a single tag per "
       "token (Default is multiple tags)")
      ("threshold,e",value<float>(&threshold)->default_value(0.0),
       "threshold above which a NE will be issued")
      ("output-path,O",
       value<StringXML>(&outputLocation)->default_value("afner-output"),
       "path to dump output from NER runs")
      ;
    // options for dumping NER input data
    options_description dumping("Data dumping settings");
    dumping.add_options()
      ("dump", "Set mode for generating Yasmet input data only")
      ;
    // common options
    options_description config("Common options");
    config.add_options()
      ("path,P",value< vector<StringXML> > (&dirs)->composing(),
       "directory of files")
      ("file,f",value< vector<StringXML> >(&files)->composing(),
       "individual file")
      ("training-data-file,D",value<StringXML>(&trainDataFile),
       "file for dumping training data for YASMET")
      ("context,C",value<int>(&context)->default_value(2),
       "range of contextual features used")
      ("entity_list",value<StringXML>(&listfiles)->default_value("config/list_spec"),
       "file containing location of entity lists paired with tags")
      ("tagset,g",value<StringXML>(&tagsetLoc)->default_value("config/BBN_tags"),
       "location of file storing tagset information")
      ("regex-file,x",value<StringXML>(&regex_loc)->default_value("config/regex"),
       "location of broad regular expression specification")
      ("feature-regex-file,y",value<StringXML>(&feature_regex_file)->default_value("config/feature_regex"),
       "location of regular expressions to be used only as features")
      ("feature-weight",value< vector<StringXML> >(&feature_weight)->
       composing()->multitoken(),
       "sets the weight of a feature i.e. feature1 0")
      ("default-feature-weight",
       value<float>(&default_weight)->default_value(1.0),
       "sets the default weight of features")
      ("token-frequency-input", value<StringXML>(&freqIn),
       "the location of token frequency information generated in training")
      ("prev-token-frequency-input", value<StringXML>(&prevFreqIn),
       "the location of previous token frequency information generated in"
       " training")
      ;
    options_description cmdline_options;
    cmdline_options.add(command).add(config).add(running).add(training).add(dumping).add(counting);
    options_description config_file_options;
    config_file_options.add(config).add(running).add(training).add(dumping).add(counting);
    positional_options_description p;
    p.add("run-file",-1);
    variables_map vm;
    // The command line is stored first, then the configuration file.
    store(command_line_parser(argc,argv).
          options(cmdline_options).positional(p).run(),vm);
    notify(vm);
    ifstream ifs(configFile.c_str());
    store(parse_config_file(ifs,config_file_options),vm);
    notify(vm);
    // if help option selected, print help
    if (vm.count("help")) {
      cmdline_options.print(cout);
      cout << endl << "Configuration options can be set either at "
           << "command line or in the configuration \nfile." << endl << endl;
      return 1;
    }
    // --single/-S is a plain switch: honour it when present (it used to be
    // registered but never read, so it had no effect).
    singleLabels = (vm.count("single") > 0);
    // work out from passed parameters whether to test, train or count
    int countModes = 0;
    if (vm.count("count")) {
      docount = true;
      countModes += 1;
      cout << "Counting parameters detected" << endl;
    }
    if (vm.count("train")) {
      dotrain = true;
      countModes += 1;
      cout << "Training parameters detected" << endl;
    }
    if (vm.count("run")) {
      dotest = true;
      countModes += 1;
      cout << "Testing parameters detected." << endl;
    }
    if (vm.count("dump")) {
      dodump = true;
      countModes += 1;
      cout << "Dumping parameters detected." << endl;
    }
    if (countModes > 1) {
      cout << "Only one mode (count, train, test, dump is allowed)." << endl;
      insufParam = true;
    }
    if (countModes == 0) {
      dotest = true;
      cout << "No mode selected; defaulting to --run (testing parameters)"
           << endl;
    }
    if (vm.count("path") + vm.count("file") == 0) {
      cout << "You must specify at least a directory (-P) and/or a file (-f)."
           << endl;
      insufParam = true;
    }
    if ((dotrain || dodump) && vm.count("training-data-file") == 0) {
      cout << "You must specify a training data file (-D)."
           << endl;
      insufParam = true;
    }
    if (dotrain && vm.count("output-model-file") == 0) {
      cout << "You must provide a model file in the training mode." << endl
           << "Use --dump instead of --train if you only want to produce the "
           << "Yasmet input data."
           << endl;
      insufParam = true;
    }
    if ((dotrain || dotest || dodump) && vm.count("tagset") == 0) {
      cout << "You must specify a tagset (-g)."
           << endl;
      insufParam = true;
    }
    if (insufParam) {
      cout << "Run with --help for options."
           << endl;
      return 1;
    }
    // initialise tagset
    ifstream inFile;
    inFile.open(tagsetLoc.c_str());
    if (!inFile) {
      cout << "Unable to open tagset data file " << tagsetLoc << endl;
      return 1;
    }
    EntityTagset t(inFile);
    inFile.close();
    cout << endl << "Using tags:" << endl;
    t.printTags(cout);
    cout << endl;
    RegexHandler rh;
    if (regex_loc!="") {
      ifstream regexFile;
      regexFile.open(regex_loc.c_str());
      if (!regexFile) {
        cout << "Unable to open regex data file " << regex_loc << endl;
      }
      // NOTE(review): the handler is still built from the stream when the
      // open failed, as before - confirm RegexHandler tolerates that.
      RegexHandler r(regexFile);
      rh = r;
      regexFile.close();
    }
    if (listfiles=="") {
      cout << "Warning: no lists specified." << endl;
    }
    // if lists have been specified
    ListHandler lh(InitializeListHandler(listfiles));
    // create the feature handler
    FeatureHandler fh(lh,rh,t,feature_regex_file,context,
                      default_weight,freqIn,prevFreqIn);
    // set custom feature weights
    if (vm.count("feature-weight")) {
      for (vector<StringXML>::const_iterator i=feature_weight.begin();
           i!=feature_weight.end();i++) {
        setFeatureWeight(*i,fh);
      }
    }
    cout << endl;
    fh.printFeatures(cout);
    cout << endl;
    // create the decorator object
    NEDeco deco(&t,&rh,&lh,&fh,modelFile,maxLabels,context,singleLabels);
    if (docount) {
      cout << "COUNTING" << endl << endl;
      train(trainDataFile,files,dirs,deco,t.classCount(),freq,
            prevFreq);
    }
    if (dotrain) {
      cout << "TRAINING" << endl << endl;
      train(trainDataFile,files,dirs,deco,t.classCount(),freq,
            prevFreq);
      // if a model file is given, create the model file directly
      // NOTE(review): "model-file" carries a default value, so this test
      // presumably always succeeds - confirm whether "output-model-file"
      // was intended.
      if (vm.count("model-file")) {
        cout << "Generating model file. This may take some time..."
             << endl << endl;
        stringstream ss;
        ss << "cat " << trainDataFile << " | " << "../yasmet/yasmet > "
           << modelFile;
        StringXML cmd = ss.str();
        std::system(cmd.c_str());
      }
    }
    if (dodump) {
      cout << "DUMPING" << endl << endl;
      train(trainDataFile,files,dirs,deco,t.classCount(),freq,
            prevFreq);
    }
    if (dotest) {
      cout << "TESTING" << endl;
      test(files,dirs,deco,format,outputLocation,threshold);
    }
  }
  catch(std::exception& e) {
    cerr << "error: " << e.what() << endl;
    return 1;
  }
  catch(...) {
    cerr << "Unknown exception." << endl;
    // Report failure to the caller, like the std::exception handler above,
    // instead of falling through to the success return.
    return 1;
  }
  return EXIT_SUCCESS;
}
void testDirectory(const StringXML& path,NEDeco deco,
const StringXML format,
const StringXML resultsDir,
float threshold){
vector<StringXML> flist;
vector<StringXML> dirlist;
list_directory(path,flist,dirlist);
cout << "Running directory: " << path << endl;
cout << "Output to: " << resultsDir << endl << endl;
for (unsigned int i=0;i<flist.size();i++){
//cout << "Testing File " << path << flist[i] << " " << resultsDir << endl;
testFile(path,flist[i],deco,format,resultsDir,threshold);
cout << i + 1 << " of " << flist.size() << " files done" << endl << endl;
}
for (unsigned int i=0;i<dirlist.size();i++) {
StringXML newpath = path + "/" + dirlist[i];
StringXML newres = resultsDir + "/" + dirlist[i];
//cout << "testDirectory " << newpath << " " << newres << endl;
testDirectory(newpath,deco,format,newres,threshold);
cout << i + 1 << " of " << dirlist.size()
<< " directories done" << endl << endl;
}
}
void trainDirectory(const StringXML& path,ostream& out,NEDeco deco,
vector<map<StringXML,int> >& frequencies,
vector<map<StringXML,int> >& prevFrequencies,
const bool countFrequencies){
vector<StringXML> flist;
vector<StringXML> dirlist;
list_directory(path,flist,dirlist);
cout << path << endl;
for (unsigned int i=0;i<flist.size();i++) {
cout << '_' << flush;
}
cout << endl;
for (unsigned int i=0;i<flist.size();i++){
StringXML full = path + "/" + flist[i];
trainFile(full,out,deco,frequencies,prevFrequencies,countFrequencies);
cout << '|' << flush;
}
cout << endl << endl;
}
// given a model file, do the Entity checking
void testFile(const StringXML& path,const StringXML& fname,
NEDeco deco,const StringXML format,
const StringXML resultsDir, float threshold) {
StringXML fullname = path + "/" + fname;
StringXML simpFilename=fname;
if (path==".") {
fullname = fname;
int slashpos=fname.rfind("/",fname.size()-1);
simpFilename=fname.substr(slashpos,fname.size());
}
StringXML text = readfile(fullname);
StringXML resultfile = resultsDir + "/" + simpFilename;
cout << "\tRunning file: " << fullname << endl;
cout << "\tOutput file: " << resultfile << endl;
//NEDeco deco(text,t,rh,modelFile,maxLabels,context,singleLabels);
deco.Decorate(&text);
ofstream outfile;
outfile.open(resultfile.c_str());
if (outfile.is_open()) {
if (format=="SHORT")
deco.printTRECEnts(outfile,threshold);
else
deco.printEnts(outfile,threshold);
outfile.close();
}
else { // if file not open, try making directory then file
StringXML x = "mkdir " + resultsDir + " -p";
std::system(x.c_str());
outfile.open(resultfile.c_str());
if (outfile.is_open()) {
if (format=="SHORT") {
deco.printTRECEnts(outfile,threshold);
}
else {
deco.printEnts(outfile,threshold);
}
outfile.close();
}
else {
cout << "Unable to open " << resultfile << endl;
}
}
//deco.printEnts();
}
void trainFile(const StringXML& path, ostream& out,NEDeco deco,
vector<map<StringXML,int> >& frequencies,
vector<map<StringXML,int> >& prevFrequencies,
const bool countFrequencies){
StringXML text(readfile(path));
deco.PrintTrainingData(&text,false,out,frequencies,prevFrequencies,
countFrequencies);
}
void test(const vector<StringXML>& files,
const vector<StringXML>& dirs,NEDeco deco,
const StringXML& format,const StringXML resultsDir, float threshold) {
cout << files.size() << " files, " << dirs.size() << " directories." << endl;
cout << "Files: " << endl;
for (unsigned int i=0;i<files.size();i++) {
cout << '_' << flush;
}
cout << endl;
for (vector<StringXML>::const_iterator ite=files.begin();
ite!=files.end();ite++) {
testFile(".",*ite,deco,format,resultsDir,threshold);
cout << '|' << flush;
}
cout << endl << "Directories:" << endl;
for (vector<StringXML>::const_iterator ite=dirs.begin();
ite!=dirs.end();ite++) {
testDirectory(*ite,deco,format,resultsDir,threshold);
}
}
void train(const StringXML& outputFile, const vector<StringXML>& files,
const vector<StringXML>& dirs,NEDeco deco,const int classCount,
const StringXML& freqFile,const StringXML& prevFreqFile) {
ofstream out;
out.open(outputFile.c_str());
out << classCount << endl;
bool countFrequencies=false;
// check whether to count frequencies
if (freqFile!="" || prevFreqFile!="") {
cout << "Counting token frequencies." << endl;
countFrequencies=true;
if (freqFile=="") {
cerr << "Error: output file for frequency counts not specified" << endl;
return;
}
if (prevFreqFile=="") {
cerr << "Error: output file for previous frequency counts not specified"
<< endl;
return;
}
}
vector<map<StringXML,int> > frequencies;
vector<map<StringXML,int> > prevFrequencies;
for (int i=0;i<classCount;i++) {
map<StringXML,int> m;
map<StringXML,int> pm;
frequencies.push_back(m);
prevFrequencies.push_back(pm);
}
cout << files.size() << " files, " << dirs.size() << " directories." << endl;
cout << "Files: " << endl;
for (unsigned int i=0;i<files.size();i++) {
cout << '_' << flush;
}
cout << endl;
// initialise empty maps for each tag
//for (int i=0;i<(classCount/2)-1;i++) {
//}
for (vector<StringXML>::const_iterator ite=files.begin();
ite!=files.end();ite++) {
trainFile(*ite,out,deco,frequencies,prevFrequencies,countFrequencies);
cout << '|' << flush;
}
cout << endl << "Directories:" << endl;
for (vector<StringXML>::const_iterator ite=dirs.begin();
ite!=dirs.end();ite++) {
trainDirectory(*ite,out,deco,frequencies,prevFrequencies,countFrequencies);
}
out.close();
// dump the frequency information to file
if (freqFile!="") {
out.open(freqFile.c_str());
// for each tag
for (unsigned int j=0;j<frequencies.size();j++) {
out << "#### Tag " << j << " --------------------" << endl;
// print each token
for (map<StringXML,int>::const_iterator ite=frequencies[j].begin();
ite!=frequencies[j].end();ite++) {
out << ite->first << " " << ite-> second << endl;
}
out << endl;
}
out.close();
}
if (prevFreqFile!="") {
out.open(prevFreqFile.c_str());
// for each tag
for (unsigned int j=0;j<prevFrequencies.size();j++) {
out << "#### Tag " << j << " --------------------" << endl;
// print each token
for (map<StringXML,int>::const_iterator ite=prevFrequencies[j].begin();
ite!=prevFrequencies[j].end();ite++) {
out << ite->first << " " << ite-> second << endl;
}
out << endl;
}
out.close();
}
}
void setFeatureWeight(const StringXML& setting, FeatureHandler& fh) {
try {
// get the string up until the space
istringstream is(setting);
StringXML feature = "";
double weight=0.0;
is >> feature >> weight;
if (!fh.setFeatureWeight(feature,weight)) {
cout << "Warning: Error setting feature weight: " << setting << endl;
}
} catch(...) {
cerr << "Error applying setting: " << setting << endl;
}
}
////////////////////////////////////////////////////////////////////////////////
// Puts the filenames in a given directory in the given vector
////////////////////////////////////////////////////////////////////////////////
void list_directory(const StringXML& path, vector<StringXML>& flisting,
vector<StringXML>& dirlisting) {
fs::path dir_path(path);
if (!fs::exists(dir_path)) {
cerr << "Can't list directory: " << path << endl;
}
fs::directory_iterator end_itr; // default construction yields past-the-end
for (fs::directory_iterator itr(dir_path); itr!=end_itr; ++itr) {
if (fs::is_directory(*itr)) { // dir
dirlisting.push_back(itr->path().filename().string());
}
else { // file
flisting.push_back((dir_path/itr->path().filename()).leaf().string());
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Builds the map used to initialise the list handler from the list
// specification file `file`.
// Empty lines and lines starting with '#' are skipped. The remaining lines
// are taken in pairs: the first line of a pair is the map key, the second is
// used to construct the EntityTag stored under that key.
// If the file cannot be opened, an error is printed and an empty map is
// returned.
////////////////////////////////////////////////////////////////////////////////
map<StringXML,EntityTag>
InitializeListHandler(const StringXML& file) {
  map<StringXML,EntityTag> listMap;

  ifstream spec;
  spec.open(file.c_str());
  if (!spec) {
    cout << "Error opening list specification file." << endl;
  }

  StringXML pendingList="";   // key waiting for its tag line
  bool expectList=true;       // true: next useful line is a key
  StringXML current="";
  while (getline(spec,current)) {
    // skip blank lines and comments
    if (current.empty() || current[0]=='#') {
      continue;
    }
    if (expectList) {
      pendingList=current;
      expectList=false;
      continue;
    }
    // second line of the pair: the tag for the pending key
    cout << "Using list: " << pendingList << "\t" << current << endl;
    EntityTag tag(current);
    listMap.insert(make_pair(pendingList,tag));
    pendingList="";
    expectList=true;
  }
  spec.close();
  return listMap;
}
////////////////////////////////////////////////////////////////////////////////
// Reads and returns the text of a file.
// On success the result is a newline followed by the file's contents, which
// is what the former line-by-line loop produced. If the file cannot be
// opened, a message is printed on cout and an empty string is returned.
// The contents are pulled through the stream buffer in one go, so a read
// error ends the read instead of leaving a loop that waits for end-of-file
// forever, and the text is no longer re-copied for every line.
////////////////////////////////////////////////////////////////////////////////
StringXML readfile(const StringXML& fname) {
  StringXML out = "";
  std::ifstream inFile(fname.c_str());
  if (!inFile) {
    std::cout << "Unable to open file: " << fname << std::endl;
    return out;
  }
  std::ostringstream contents;
  contents << inFile.rdbuf();
  out = "\n";  // the returned text has always started with a newline
  out += contents.str();
  return out;
}