From 6f27cc9197ccc8521a80aabba7d0d904773f5554 Mon Sep 17 00:00:00 2001 From: Max Gaukler Date: Fri, 25 Oct 2024 20:09:00 +0200 Subject: [PATCH] PDF import: Fix invalid character encoding in SVG Harden the PDF import against invalid character encoding: - Add many checks for valid encoding of strings - Clean up pdf object names so that they become valid SVG IDs Fixes https://gitlab.com/inkscape/inbox/-/issues/10724 --- .../internal/pdfinput/poppler-utils.cpp | 51 +++++++++++++++++++ .../internal/pdfinput/poppler-utils.h | 1 + .../internal/pdfinput/svg-builder.cpp | 19 +++---- src/id-clash.cpp | 15 ++++-- testfiles/CMakeLists.txt | 1 + .../pdfinput/Annot_w_AP_expected.svg | 2 +- .../pdfinput/XObj_layer_expected.svg | 4 +- testfiles/src/poppler-utils-test.cpp | 40 +++++++++++++++ 8 files changed, 118 insertions(+), 15 deletions(-) create mode 100644 testfiles/src/poppler-utils-test.cpp diff --git a/src/extension/internal/pdfinput/poppler-utils.cpp b/src/extension/internal/pdfinput/poppler-utils.cpp index badab64739..700490d234 100644 --- a/src/extension/internal/pdfinput/poppler-utils.cpp +++ b/src/extension/internal/pdfinput/poppler-utils.cpp @@ -553,7 +553,58 @@ FontList getPdfFonts(std::shared_ptr pdf_doc) return fontsList; } +/** + * Convert arbitrary string (e.g. group name in PDF) to a valid SVG ID. + * + * This function guarantees the following: + * - The result is a valid SVG ID. + * - Two different inputs can never lead to the same output, so ID collisions are avoided. + * (Mathematically, the function is invertible.) + * - Valid SVG IDs are preserved if they only use the characters a-z A-Z 0-9. + * + * It does *not* guarantee that all other valid input SVG IDs are preserved. + * (This would be impossible together with the above guarantees.) + * + * See also sanitize_id() in id-clash.cpp for a less aggressive version that is, however, not collision-free. + */ +std::string sanitizeId(std::string const &in) +{ + // XML allows IDs of the form [a-zA-Z_:][a-zA-Z0-9\-_\.:]* plus some UTF8 characters. + // Here we restrict us to the subset [a-zA-Z_][a-zA-Z0-9_]*, + // where "_" is used as escape character. + // https://www.w3.org/TR/2008/REC-xml-20081126/#id + // https://stackoverflow.com/questions/1077084/what-characters-are-allowed-in-dom-ids#1077111 + + if (in.empty()) { + return "_"; + } + if (isalpha(in[0]) && std::find_if_not(in.begin(), in.end(), isalnum) == in.end()) { + [[likely]]; + // Fast path: + // Input is of the form [a-zA-Z][a-zA-Z0-9]* + // --> Return unchanged. + return in; + } + + // Slow path: Escape anything non-alphanumeric, + // e.g., "a bc" as a_20bc + std::ostringstream outStream; + for (char chr : in) { + if (isalnum(chr)) { + outStream.put(chr); + } else { + outStream.put('_'); + outStream << std::hex << ((unsigned int)chr & 0xff); + } + } + return outStream.str(); +} +/** + * Ensure string is valid UTF8. + * If not, return an empty string. + * (This could be changed in the future to only remove the problematic characters.) + */ std::string validateString(std::string const &in) { if (g_utf8_validate(in.c_str(), -1, nullptr)) { diff --git a/src/extension/internal/pdfinput/poppler-utils.h b/src/extension/internal/pdfinput/poppler-utils.h index 3df3045fbe..705665a8cd 100644 --- a/src/extension/internal/pdfinput/poppler-utils.h +++ b/src/extension/internal/pdfinput/poppler-utils.h @@ -80,6 +80,7 @@ std::string getDictString(Dict *dict, const char *key); std::string getString(const std::unique_ptr &value); std::string getString(const GooString *value); std::string validateString(std::string const &in); +std::string sanitizeId(std::string const &in); // Replacate poppler FontDict class InkFontDict diff --git a/src/extension/internal/pdfinput/svg-builder.cpp b/src/extension/internal/pdfinput/svg-builder.cpp index 93aa21ba92..3c40098ab8 100644 --- a/src/extension/internal/pdfinput/svg-builder.cpp +++ b/src/extension/internal/pdfinput/svg-builder.cpp @@ -136,7 +136,7 @@ void SvgBuilder::pushPage(const std::string &label, GfxState *state) _page->setAttributeSvgDouble("y", _page_top); if (!label.empty()) { - _page->setAttribute("inkscape:label", label); + _page->setAttribute("inkscape:label", validateString(label)); } auto _nv = _doc->getNamedView()->getRepr(); _nv->appendChild(_page); @@ -248,7 +248,7 @@ void SvgBuilder::setMargins(const Geom::Rect &page, const Geom::Rect &margins, c void SvgBuilder::setMetadata(char const *name, const std::string &content) { if (name && !content.empty()) { - rdf_set_work_entity(_doc, rdf_find_entity(name), content.c_str()); + rdf_set_work_entity(_doc, rdf_find_entity(name), validateString(content).c_str()); } } @@ -259,7 +259,7 @@ void SvgBuilder::setAsLayer(const char *layer_name, bool visible) { _container->setAttribute("inkscape:groupmode", "layer"); if (layer_name) { - _container->setAttribute("inkscape:label", layer_name); + _container->setAttribute("inkscape:label", validateString(layer_name)); } if (!visible) { SPCSSAttr *css = sp_repr_css_attr_new(); @@ -852,7 +852,7 @@ Inkscape::XML::Node *SvgBuilder::_createClip(const std::string &d, const Geom::A void SvgBuilder::beginMarkedContent(const char *name, const char *group) { if (name && group && std::string(name) == "OC") { - auto layer_id = std::string("layer-") + group; + auto layer_id = std::string("layer-") + sanitizeId(group); if (auto existing = _doc->getObjectById(layer_id)) { if (existing->getRepr()->parent() == _container) { _container = existing->getRepr(); @@ -872,7 +872,7 @@ void SvgBuilder::beginMarkedContent(const char *name, const char *group) } else { auto node = _pushGroup(); if (group) { - node->setAttribute("id", std::string("group-") + group); + node->setAttribute("id", std::string("group-") + sanitizeId(group)); } } } @@ -884,8 +884,9 @@ void SvgBuilder::addOptionalGroup(const std::string &oc, const std::string &labe Inkscape::XML::Node *SvgBuilder::beginLayer(const std::string &label, bool visible) { + auto id = sanitizeId(label); Inkscape::XML::Node *save_current_location = _container; - if (auto existing = _doc->getObjectById(label)){ + if (auto existing = _doc->getObjectById(id)) { _container = existing->getRepr(); _node_stack.push_back(_container); } else { @@ -893,9 +894,9 @@ Inkscape::XML::Node *SvgBuilder::beginLayer(const std::string &label, bool visib _popGroup(); } auto node = _pushGroup(); - node->setAttribute("id", label.c_str()); + node->setAttribute("id", id); setAsLayer(label.c_str(), visible); - } + } return save_current_location; } @@ -933,7 +934,7 @@ std::string SvgBuilder::_getColorProfile(cmsHPROFILE hp) return _icc_profiles[hp]; auto profile = Colors::CMS::Profile::create(hp); - std::string name = profile->getName(); + std::string name = validateString(profile->getName()); // Find the named profile in the document (if already added) if (_doc->getDocumentCMS().getSpace(name)) diff --git a/src/id-clash.cpp b/src/id-clash.cpp index 5044a5f928..b054b7dcfb 100644 --- a/src/id-clash.cpp +++ b/src/id-clash.cpp @@ -494,12 +494,21 @@ change_def_references(SPObject *from_obj, SPObject *to_obj) } } -// Supposedly this is a list of valid XML 1.0 ID characters -// TODO: find link to some reference page +// This is a subset of the valid XML 1.0 ID characters. const char valid_id_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.:"; +/** + * Convert string to valid XML id. + * The input is returned unchanged if it is a valid ID and only uses ASCII characters. + * The returned ID is not guaranteed to be unique. Different inputs can map to the same output. + */ Glib::ustring sanitize_id(const Glib::ustring& input_id) { - if (input_id.empty()) return input_id; + // XML allows IDs of the form [a-zA-Z_:][a-zA-Z0-9\-_\.:]* plus some UTF8 characters. + // We restrict us to the ASCII subset for simplicity. + // https://www.w3.org/TR/2008/REC-xml-20081126/#id + // https://stackoverflow.com/questions/1077084/what-characters-are-allowed-in-dom-ids#1077111 + if (input_id.empty()) + return "_"; auto id = input_id; for (auto pos = id.find_first_not_of(valid_id_chars); diff --git a/testfiles/CMakeLists.txt b/testfiles/CMakeLists.txt index 0042850474..2dd373c2c6 100644 --- a/testfiles/CMakeLists.txt +++ b/testfiles/CMakeLists.txt @@ -90,6 +90,7 @@ set(TEST_SOURCES util-test drag-and-drop-svgz drawing-pattern-test + poppler-utils-test extract-uri-test attributes-test dir-util-test diff --git a/testfiles/cli_tests/testcases/pdfinput/Annot_w_AP_expected.svg b/testfiles/cli_tests/testcases/pdfinput/Annot_w_AP_expected.svg index 142b78009b..414eb6e92f 100644 --- a/testfiles/cli_tests/testcases/pdfinput/Annot_w_AP_expected.svg +++ b/testfiles/cli_tests/testcases/pdfinput/Annot_w_AP_expected.svg @@ -486,7 +486,7 @@ + +namespace Inkscape { + +TEST(PopplerUtilsTest, SanitizeId) +{ + ASSERT_EQ(sanitizeId(""), "_"); + ASSERT_EQ(sanitizeId("hello"), "hello"); + ASSERT_EQ(sanitizeId("a bc"), "a_20bc"); + ASSERT_EQ(sanitizeId("a\xff" + "bc"), + "a_ffbc"); +} +} // namespace Inkscape + +/* + Local Variables: + mode:c++ + c-file-style:"stroustrup" + c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . +)) + indent-tabs-mode:nil + fill-column:99 + End: +*/ +// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:fileencoding=utf-8:textwidth=99 : -- GitLab