1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
|
/*
Copyright (c) 1998 - 2024
ILK - Tilburg University
CLST - Radboud University
CLiPS - University of Antwerp
This file is part of timbl
timbl is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
timbl is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
For questions and suggestions, see:
https://github.com/LanguageMachines/timbl/issues
or send mail to:
lamasoftware (at ) science.ru.nl
*/
#ifndef TIMBL_FEATURES_H
#define TIMBL_FEATURES_H
#include <vector>
#include <map>
#include <unordered_map>
#include "timbl/MsgClass.h"
#include "timbl/Matrices.h"
#include "ticcutils/Unicode.h"
// Forward declaration only: this header refers to Hash::UnicodeHash solely
// through pointers, so the complete type is not required here.
namespace Hash {
class UnicodeHash;
}
namespace Timbl {
// Forward declarations of types that are not defined in this header.
// TargetValue, Targets and metricClass are only used through pointers or
// references below.
// NOTE(review): ValueClass is also used as the base class of FeatureValue
// further down, which requires the complete type -- presumably the header
// defining it is included before this point; confirm.
class ValueClass;
class TargetValue;
class Targets;
class metricClass;
/// Sparse table mapping a size_t index to a double.
///
/// Only indices that were explicitly assigned are stored. Iteration through
/// begin()/end() is read-only and follows the ordering of the underlying
/// std::map (ascending index).
class SparseValueProbClass {
  // The stream operator (defined elsewhere) is granted access to the
  // private members.
  friend std::ostream& operator<< ( std::ostream&, SparseValueProbClass * );

public:
  using IDmaptype = std::map< size_t, double >;
  using IDiterator = IDmaptype::const_iterator;

  /// Create an empty table; @p d is stored as the dimension.
  explicit SparseValueProbClass( size_t d ):
    dimension(d)
  {
  }

  /// Store @p d under index @p i, replacing any value already present.
  void Assign( const size_t i, const double d ) {
    vc_map[i] = d;
  }

  /// Drop all stored entries; the dimension is left untouched.
  void Clear() {
    vc_map.clear();
  }

  /// Read-only iterator to the first stored entry.
  IDiterator begin() const {
    return vc_map.cbegin();
  }

  /// Read-only past-the-end iterator.
  IDiterator end() const {
    return vc_map.cend();
  }

private:
  IDmaptype vc_map;   // index -> value, only for assigned indices
  size_t dimension;   // value handed to the constructor
};
/// Status codes; Feature::prepare_numeric_stats() is declared to return one
/// of these.
/// The numeric values are spelled out explicitly and are the same ones the
/// compiler assigns by default (0..4).
/// NOTE(review): the exact meaning of each enumerator is not visible in this
/// header -- confirm against the implementation before relying on the names.
enum FeatVal_Stat {
  Unknown          = 0,
  Singleton        = 1,
  SingletonNumeric = 2,
  NumericValue     = 3,
  NotNumeric       = 4
};
/// A ValueClass specialisation that additionally carries a ClassDistribution
/// and a pointer to a SparseValueProbClass.
///
/// Feature, Feature_List and D_D are friends and may access the private
/// members directly.
class FeatureValue: public ValueClass {
  friend class Feature;
  friend class Feature_List;
  friend struct D_D;

public:
  // Constructors and destructor are defined out of line.
  explicit FeatureValue( const icu::UnicodeString& );
  FeatureValue( const icu::UnicodeString&, size_t );
  ~FeatureValue() override;

  /// Merge @p vd into the stored distribution, then set the frequency to the
  /// total size of the merged distribution.
  /// (_frequency is not declared in this class; presumably inherited from
  /// ValueClass.)
  void ReconstructDistribution( const ClassDistribution& vd ) {
    TargetDist.Merge( vd );
    _frequency = TargetDist.totalSize();
  }

  /// True when _index equals 0.
  /// (_index is not declared in this class; presumably inherited from
  /// ValueClass.)
  bool isUnknown() const {
    return _index == 0;
  }

  /// Return the stored SparseValueProbClass pointer as is.
  SparseValueProbClass *valueClassProb() const {
    return ValueClassProb;
  }

private:
  SparseValueProbClass *ValueClassProb;
  ClassDistribution TargetDist;
};
/// Bookkeeping for one feature: an ignore flag, a weight, a set of statistics
/// fields, a vector of FeatureValue pointers and a public metric_matrix
/// pointer.
///
/// Only the trivial accessors are defined inline in this header; every other
/// member function is declared here and defined elsewhere.
/// MBLClass and Feature_List are friends and may access the private members.
class Feature: public MsgClass {
  friend class MBLClass;
  friend class Feature_List;
public:
  explicit Feature( Hash::UnicodeHash *T );
  ~Feature() override;

  /// Getter/setter pair for the ignore flag.
  bool Ignore() const { return ignore; }
  void Ignore( const bool val ){ ignore = val; }

  bool setMetricType( const MetricType );
  MetricType getMetricType() const;

  /// Getter/setter pairs for the weight and the statistics fields.
  /// Every getter returns the stored member unchanged; every setter simply
  /// overwrites it.
  double Weight() const { return weight; }
  void SetWeight( const double w ) { weight = w; }
  double InfoGain() const { return info_gain; }
  void InfoGain( const double w ){ info_gain = w; }
  double SplitInfo() const { return split_info; }
  void SplitInfo( const double w ){ split_info = w; }
  double GainRatio() const { return gain_ratio; }
  void GainRatio( const double w ){ gain_ratio = w; }
  double ChiSquare() const { return chi_square; }
  void ChiSquare( const double w ){ chi_square = w; }
  double SharedVariance() const { return shared_variance; }
  void SharedVariance( const double w ){ shared_variance = w; }
  double StandardDeviation() const { return standard_deviation; }
  void StandardDeviation( const double w ){ standard_deviation = w; }
  double Min() const { return n_min; }
  void Min( const double val ){ n_min = val; }
  double Max() const { return n_max; }
  void Max( const double val ){ n_max = val; }

  // Value handling (defined out of line).
  double fvDistance( const FeatureValue *,
                     const FeatureValue *,
                     size_t=1 ) const;
  FeatureValue *add_value( const icu::UnicodeString&, TargetValue *, int=1 );
  FeatureValue *add_value( size_t, TargetValue *, int=1 );
  FeatureValue *Lookup( const icu::UnicodeString& ) const;
  bool decrement_value( FeatureValue *, const TargetValue * );
  bool increment_value( FeatureValue *, const TargetValue * );
  size_t EffectiveValues() const;
  size_t TotalValues() const;
  bool isNumerical() const;
  bool isStorableMetric() const;
  bool AllocSparseArrays( size_t );
  void InitSparseArrays();

  /// Return the vcpb_read flag.
  /// Const-qualified so it can be called through a const Feature, in line
  /// with the other getters of this class.
  bool ArrayRead() const { return vcpb_read; }

  // Matrix handling and (de)serialisation (defined out of line).
  bool matrixPresent( bool& ) const;
  size_t matrix_byte_size() const;
  bool store_matrix( int = 1 );
  void clear_matrix();
  bool fill_matrix( std::istream& );
  void print_matrix( std::ostream&, bool = false ) const;
  void print_vc_pb_array( std::ostream& ) const;
  bool read_vc_pb_array( std::istream & );

  // Statistics (defined out of line).
  FeatVal_Stat prepare_numeric_stats();
  void Statistics( double, const Targets&, bool );
  void NumStatistics( double, const Targets&, int, bool );

  /// Getter/setter pair for matrix_clip_freq.
  void ClipFreq( size_t f ){ matrix_clip_freq = f; }
  size_t ClipFreq() const { return matrix_clip_freq; }

  // Public data member: accessible to all users of the class.
  SparseSymetricMatrix<const ValueClass *> *metric_matrix;

private:
  // Copy construction and copy assignment are private: only members and the
  // friend classes can copy a Feature.
  Feature( const Feature& );
  Feature& operator=( const Feature& );

  Hash::UnicodeHash *TokenTree;
  metricClass *metric;
  bool ignore;
  bool numeric;
  bool vcpb_read;
  enum ps_stat{ ps_undef, ps_failed, ps_ok, ps_read };
  enum ps_stat PrestoreStatus;
  MetricType Prestored_metric;
  void delete_matrix();
  double entropy;
  double info_gain;
  double split_info;
  double gain_ratio;
  double chi_square;
  double shared_variance;
  double standard_deviation;
  size_t matrix_clip_freq;
  std::vector<long int> n_dot_j;
  std::vector<long int> n_i_dot;
  double n_min;
  double n_max;
  double weight;

  // Private helpers behind the public statistics entry points
  // (defined out of line).
  void Statistics( double );
  void NumStatistics( std::vector<FeatureValue *>&, double );
  void ChiSquareStatistics( const std::vector<FeatureValue *>&,
                            const Targets& );
  void ChiSquareStatistics( const Targets& );
  void SharedVarianceStatistics( const Targets&, int );
  void StandardDeviationStatistics();

  std::vector<FeatureValue *> values_array;
  std::unordered_map< size_t, FeatureValue *> reverse_values;
  bool is_reference;
};
/// Container of Feature pointers together with feature counters, a permuted
/// view (perm_feats / permutation) and a pointer to a Hash::UnicodeHash.
///
/// MBLClass is a friend and may access the private members.
/// NOTE(review): a copy assignment operator and a destructor are declared,
/// but no copy constructor -- the compiler-generated one would copy the
/// pointer vectors member-wise; confirm that this is intended.
class Feature_List: public MsgClass {
  friend class MBLClass;
public:
  /// Create an empty list: all counters are 0, no hash is attached and the
  /// list is not marked as a reference.
  Feature_List():
    _eff_feats(0),
    _num_of_feats(0),
    _num_of_num_feats(0),
    _feature_hash(nullptr),   // nullptr rather than literal 0 for a pointer
    _is_reference(false)
  {
  }

  /// Create an empty list (via the default constructor) and attach @p hash.
  /// The pointer is stored as is.
  explicit Feature_List( Hash::UnicodeHash *hash ):
    Feature_List()
  {
    _feature_hash = hash;
  }

  Feature_List &operator=( const Feature_List& );
  ~Feature_List() override;

  void init( size_t, const std::vector<MetricType>& );

  /// Return the attached hash pointer (nullptr when none was given).
  Hash::UnicodeHash *hash() const { return _feature_hash; }

  /// Return _eff_feats.
  /// Const-qualified so it can be called through a const Feature_List, in
  /// line with hash() and operator[].
  size_t effective_feats() const { return _eff_feats; }

  /// Return the Feature pointer at position @p i.
  /// No bounds checking is performed (plain std::vector::operator[]).
  Feature *operator[]( size_t i ) const { return feats[i]; }

  void write_permutation( std::ostream & ) const;
  void calculate_permutation( const std::vector<double>& );

  // Public data members: directly accessible to all users of the class.
  size_t _eff_feats;
  size_t _num_of_feats;
  size_t _num_of_num_feats;
  std::vector<Feature *> feats;
  std::vector<Feature *> perm_feats;
  std::vector<size_t> permutation;

private:
  Hash::UnicodeHash *_feature_hash;
  bool _is_reference;
};
} // namespace Timbl
#endif // TIMBL_FEATURES_H
|