[go: up one dir, main page]

File: rclaspell.cpp

package info (click to toggle)
recoll 1.43.0-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 16,400 kB
  • sloc: cpp: 103,890; python: 9,349; xml: 7,305; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (375 lines) | stat: -rw-r--r-- 12,199 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/* Copyright (C) 2006-2021 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#include "autoconfig.h"

#ifdef RCL_USE_ASPELL

#include "rclaspell.h"

#include <mutex>
#include <vector>
#include <string>
#include <algorithm>
#include <stdlib.h>

#include "pathut.h"
#include "execmd.h"
#include "log.h"
#include "unacpp.h"
#include "rclutil.h"
#include "smallut.h"

using namespace std;

// Private rclaspell data
// Groups the command lines and the speller subprocess handle used by the
// Aspell class. An instance is allocated by Aspell::init() and released
// with deleteZ().
class AspellData {
public:
    // Path of the aspell executable. Used to create the dictionary
    // ("create master") and to list the installed dictionaries ("dicts").
    string m_execbuild;
    // Complete command line (program followed by its arguments) of the
    // suggestion process, which is run in "pipe" mode.
    vector<string> m_execspell;
    // The suggestion subprocess, started on demand by Aspell::make_speller().
    ExecCmd m_speller;
#ifdef _WIN32
    // aspell data directory located under the package data directory.
    // Passed to aspell as --data-dir.
    string m_datadir;
#endif
    // Value of the "aspellAddCreateParam" configuration parameter: an extra
    // argument added to the aspell command lines when not empty. On Windows,
    // init() defaults it to a --local-data-dir option if it was not set.
    string m_addCreateParam;
};

// Only records the configuration pointer: the actual setup (language
// selection, locating the aspell program, building the command lines) is
// performed by init().
Aspell::Aspell(const RclConfig *cnf)
    : m_config(cnf)
{
}

// Release the private data allocated by init(), if any.
Aspell::~Aspell()
{
    deleteZ(m_data);
}

bool Aspell::init(string &reason)
{
    deleteZ(m_data);

    // Language: we get this from the configuration, else from the NLS
    // environment. The aspell language names used for selecting language 
    // definition files (used to create dictionaries) are like en, fr
    if (!m_config->getConfParam("aspellLanguage", m_lang) || m_lang.empty()) {
        string lang = "en";
        const char *cp;
        if ((cp = getenv("LC_ALL")))
            lang = cp;
        else if ((cp = getenv("LANG")))
            lang = cp;
        if (!lang.compare("C"))
            lang = "en";
        m_lang = lang.substr(0, lang.find_first_of("_"));
        if (!m_lang.compare("ja")) {
            // Aspell has no support for Japanese. We substitute
            // english, as Japanese users often have texts with
            // interspersed english words or english texts. Japanese
            // parts of the text won't be sent to aspell (check
            // Rcl::Db::isSpellingCandidate())
            m_lang = "en";
        }
    }

    m_data = new AspellData;

    m_config->getConfParam("aspellAddCreateParam", m_data->m_addCreateParam);
#ifdef _WIN32
    m_data->m_datadir = path_cat(
        path_pkgdatadir(), "filters/aspell-installed/mingw32/lib/aspell-0.60");
    if (m_data->m_addCreateParam.empty()) {
        m_data->m_addCreateParam =
            string("--local-data-dir=") + path_cat(m_config->getConfDir(), "aspell");
    }
#endif // WIN32
    
    const char *aspell_prog_from_env = getenv("ASPELL_PROG");
    if (aspell_prog_from_env && path_access(aspell_prog_from_env, X_OK) == 0) {
        m_data->m_execbuild = aspell_prog_from_env;
    }
#ifdef ASPELL_PROG
    if (m_data->m_execbuild.empty()) {
        string cmd = m_config->findFilter(ASPELL_PROG);
        LOGDEB("rclaspell::init: findFilter returns " << cmd << endl);
        if (path_isabsolute(cmd)) {
            m_data->m_execbuild.swap(cmd);
        }
    }
#endif // ASPELL_PROG
    if (m_data->m_execbuild.empty()) {
        ExecCmd::which("aspell", m_data->m_execbuild);
    }
    if (m_data->m_execbuild.empty()) {
        reason = "aspell program not found or not executable";
        deleteZ(m_data);
        return false;
    }

    // At the moment, no idea how to distribute the pyaspell Python extension on MacOS.
    // So Recoll still makes suggestions (using the aspell command) when a search fails,
    // but can't widen searches to common orthographic neighbours.
    // On other systems, we could fall back on the aspell cmd if the python module fails to load,
    // but we don't at the moment: it's rclaspell-sugg or nothing
#ifndef __APPLE__
    m_data->m_execspell = {
        "rclaspell-sugg.py", 
        string("--lang=") + m_lang,
        "--encoding=utf-8",
#ifdef _WIN32
        string("--data-dir=") + m_data->m_datadir,
#endif
        string("--master=") + dicPath(),
        "--sug-mode=fast",
        "--mode=none",
    };
    if (!m_data->m_addCreateParam.empty()) {
        m_data->m_execspell.push_back(m_data->m_addCreateParam);
    }
    m_data->m_execspell.push_back("pipe");
    m_config->processFilterCmd(m_data->m_execspell);
#else // __APPLE__ ->
    m_data->m_execspell = {
        m_data->m_execbuild,
        string("--lang=") + m_lang,
        "--encoding=utf-8",
        string("--master=") + dicPath(),
        "--sug-mode=fast",
        "--mode=none",
    };
    if (!m_data->m_addCreateParam.empty()) {
        m_data->m_execspell.push_back(m_data->m_addCreateParam);
    }
    m_data->m_execspell.push_back("pipe");
#endif
    return true;
}


// Tell if the private data is currently allocated, which is the case
// after a successful init().
bool Aspell::ok() const
{
    return m_data != nullptr;
}


// Compute the path of the aspell dictionary built from the index terms:
// "aspdict.<lang>.rws" inside the aspell cache directory of the
// configuration.
string Aspell::dicPath()
{
    const string dicname = "aspdict." + m_lang + ".rws";
    return path_cat(m_config->getAspellcacheDir(), dicname);
}


// The data source for the create dictionary aspell command. We walk
// the term list, filtering out things that are probably not words.
// Note that the manual for the current version (0.60) of aspell
// states that utf-8 is not well supported, so that we should maybe
// also filter all 8bit chars. Info is contradictory, so we only
// filter out CJK which is definitely not supported (katakana would
// make sense though, but currently no support).
class AspExecPv : public ExecCmdProvide {
public:
    string *m_input; // pointer to string used as input buffer to command
    Rcl::TermIter *m_tit;
    Rcl::Db &m_db;
    AspExecPv(string *i, Rcl::TermIter *tit, Rcl::Db &db) 
        : m_input(i), m_tit(tit), m_db(db)
        {}
    void newData() {
        while (m_db.termWalkNext(m_tit, *m_input)) {
            LOGDEB2("Aspell::buildDict: term: ["  << (m_input) << "]\n" );
            if (!Rcl::Db::isSpellingCandidate(*m_input)) {
                LOGDEB2("Aspell::buildDict: SKIP\n" );
                continue;
            }
            if (!o_index_stripchars) {
                string lower;
                if (!unacmaybefold(*m_input, lower, UNACOP_FOLD))
                    continue;
                m_input->swap(lower);
            }
            // Got a non-empty sort-of appropriate term, let's send it to
            // aspell
            LOGDEB2("Apell::buildDict: SEND\n" );
            m_input->append("\n");
            return;
        }
        // End of data. Tell so. Exec will close cmd.
        m_input->erase();
    }
};


// Create the aspell dictionary from the index terms, by feeding the
// filtered term list to an "aspell create master" command.
//
// @param db the index supplying the terms.
// @param reason set to an explanation when the term walk can't be opened
//     or when the aspell command fails.
// @return true if the dictionary was created. false if init() did not
//     succeed (reason is then left untouched) or on error.
bool Aspell::buildDict(Rcl::Db &db, string &reason)
{
    if (!ok())
        return false;

    // We create the dictionary by executing the aspell command:
    // aspell --lang=[lang] create master [dictApath]
    vector<string> args;
    args.push_back(string("--lang=") + m_lang);
    args.push_back("--encoding=utf-8");
#ifdef _WIN32
    args.push_back(string("--data-dir=") + m_data->m_datadir);
#endif
    if (!m_data->m_addCreateParam.empty()) {
        args.push_back(m_data->m_addCreateParam);
    }
    args.push_back("create");
    args.push_back("master");
    args.push_back(dicPath());

    // Printable version of the command for the error messages. It is
    // derived from the actual argument list so that the two can't diverge.
    string cmdstring(m_data->m_execbuild);
    for (const auto& arg : args) {
        cmdstring += " ";
        cmdstring += arg;
    }

    // Have to disable stderr, as numerous messages about bad strings are
    // printed. We'd like to keep errors about missing databases though, so
    // make it configurable for diags
    ExecCmd aspell;
    bool keepStderr = false;
    m_config->getConfParam("aspellKeepStderr", &keepStderr);
    if (!keepStderr)
        aspell.setStderr("/dev/null");

    Rcl::TermIter *tit = db.termWalkOpen();
    if (nullptr == tit) {
        reason = "termWalkOpen failed\n";
        return false;
    }
    string termbuf;
    AspExecPv pv(&termbuf, tit, db);
    aspell.setProvide(&pv);

    // A zero status means success. The term walk is only used while the
    // command runs: close it right away so that it is released on the
    // failure path too.
    const auto status = aspell.doexec(m_data->m_execbuild, args, &termbuf);
    db.termWalkClose(tit);
    if (!status) {
        return true;
    }

    // Failure. Try to give a useful hint by checking if aspell knows about
    // a dictionary for our language.
    bool hasdict = false;
    {
        ExecCmd cmd;
        vector<string> dictsargs;
        dictsargs.push_back("dicts");
        string dicts;
        if (cmd.doexec(m_data->m_execbuild, dictsargs, nullptr, &dicts)) {
            vector<string> vdicts;
            stringToTokens(dicts, vdicts, "\n\r\t ");
            if (find(vdicts.begin(), vdicts.end(), m_lang) != vdicts.end()) {
                hasdict = true;
            }
        }
    }
    if (hasdict) {
        reason = string("\naspell dictionary creation command [") +
            cmdstring;
        reason += string(
            "] failed. Reason unknown.\n"
            "Try to set aspellKeepStderr = 1 in recoll.conf, and execute \n"
            "the indexing command in a terminal to see the aspell "
            "diagnostic output.\n");
    } else {
        reason = string("aspell dictionary creation command failed:\n") +
            cmdstring + "\n"
            "One possible reason might be missing language "
            "data files for lang = " + m_lang +
            ". Maybe try to execute the command on a terminal command line "
            "for a better diagnostic.";
    }
    return false;
}

// Make sure that the suggestion subprocess is running, starting it if
// needed.
//
// @param reason receives an explanation (appended) when the command can't
//     be started or does not send its initial line.
// @return true if the speller is available.
bool Aspell::make_speller(string& reason)
{
    if (!ok()) {
        return false;
    }

    ExecCmd& speller = m_data->m_speller;
    if (speller.getChildPid() > 0) {
        // Already started, nothing to do.
        return true;
    }

    const vector<string>& spellcmd = m_data->m_execspell;
    LOGDEB("Starting aspell command [" << stringsToString(spellcmd) << "]\n");
    const bool started = (0 == speller.startExec(spellcmd, true, true));
    if (!started) {
        reason.append("Can't start aspell: ").append(stringsToString(spellcmd));
        return false;
    }

    // Read initial line from aspell: version etc.
    string banner;
    const bool gotbanner = (speller.getline(banner, 2) > 0);
    if (!gotbanner) {
        reason.append("Aspell: failed reading initial line");
        // Don't keep a process which does not answer.
        speller.zapChild();
        return false;
    }
    LOGDEB("rclaspell: aspell initial answer: [" << banner << "]\n");
    return true;
}

// Ask the speller for spelling suggestions for a term, keeping only the
// ones which exist in the index.
//
// @param db index used to filter the suggestions (Rcl::Db::termExists()).
// @param _term the term to check.
// @param suggestions the retained suggestions are appended to this.
// @param reason receives an explanation (appended) in case of error.
// @return false if the speller is not available or if the dialog with it
//     fails. true otherwise, including when there is nothing to suggest.
bool Aspell::suggest(
    Rcl::Db &db, const string &_term, vector<string>& suggestions, string& reason)
{
    LOGDEB("Aspell::suggest: term [" << _term << "]\n");
    if (!ok() || !make_speller(reason))
        return false;
    string mterm(_term);
    // Nothing to check for an empty term: not an error, no suggestions.
    if (mterm.empty())
        return true;

    if (!Rcl::Db::isSpellingCandidate(mterm)) {
        LOGDEB0("Aspell::suggest: [" << mterm << "] not spelling candidate, return empty/true\n");
        return true;
    }

    if (!o_index_stripchars) {
        // Raw index: the dictionary was built from case-folded terms (see
        // AspExecPv), so fold the input too.
        string lower;
        if (!unacmaybefold(mterm, lower, UNACOP_FOLD)) {
            LOGERR("Aspell::check : cant lowercase input\n");
            return false;
        }
        mterm.swap(lower);
    }

    // The dialog is one term line sent, then an answer line followed by an
    // empty line received. If a read fails or times out, we can't know
    // where we are in the answer stream any more: get rid of the process
    // (as make_speller() does), so that it is restarted on the next call
    // instead of returning answers for the wrong term.
    m_data->m_speller.send(mterm + "\n");
    std::string line;
    if (m_data->m_speller.getline(line, 3) <= 0) {
        reason.append("Aspell error: failed reading answer line\n");
        m_data->m_speller.zapChild();
        return false;
    }
    LOGDEB1("ASPELL: got answer: " << line << "\n");
    string empty;
    if (m_data->m_speller.getline(empty, 1) <= 0) {
        reason.append("Aspell: failed reading final empty line\n");
        m_data->m_speller.zapChild();
        return false;
    }

    if (line[0] == '*' || line[0] == '#') {
        // Word is in dictionary, or there are no suggestions
        return true;
    }
    string::size_type colon;
    // Aspell suggestions line: & original count offset: miss, miss, ...
    if (line[0] != '&' || (colon = line.find(':')) == string::npos || colon == line.size()-1) {
        // Neither of the answer types which we know about.
        reason.append("Aspell: bad answer line: ");
        reason.append(line);
        return false;
    }
    // Skip the colon and the following space (the test above guarantees
    // that colon + 2 does not go beyond the end of the line).
    string tail = line.substr(colon + 2);
    // NOTE(review): whether getline() keeps the line terminator is not
    // visible from here. Strip it if present, else the last suggestion
    // would never match an index term.
    tail.erase(tail.find_last_not_of("\r\n") + 1);
    std::vector<std::string> words;
    stringSplitString(tail, words, ", ");
    for (const auto& word : words) {
        if (db.termExists(word))
            suggestions.push_back(word);
    }
    return true;
}

#endif // RCL_USE_ASPELL