foray-commit Mailing List for FOray

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 13203
          http://sourceforge.net/p/foray/code/13203
Author:   victormote
Date:     2023-08-31 13:24:58 +0000 (Thu, 31 Aug 2023)
Log Message:
-----------
Improvements to orthography data.

Modified Paths:
--------------
    trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
    trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
    trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java

Modified: trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml
===================================================================

--- trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml	2023-08-31 11:23:52 UTC (rev 13202)
+++ trunk/foray/foray-orthography/src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml	2023-08-31 13:24:58 UTC (rev 13203)
@@ -56,8 +56,12 @@
 -->
 
 <w><t>&amp;</t></w>
-<w><t>&amp;c.</t><abbrev referenced-word="etc., et cetera"/></w>
+<w><t>&amp;c</t><abbrev referenced-word="etc., et cetera"/></w>
 <w><t>a</t></w>
+<w><t>a.d.</t><abbrev referenced-word="anno Domini"/><comment>Latin "year of our Lord"</comment></w>
+<w><t>a.&#xA0;d.</t><abbrev referenced-word="anno Domini"/><comment>Latin "year of our Lord"</comment></w>
+<w><t>a.m.</t><abbrev referenced-word="ante meridiem"/><comment>Latin "before noon"</comment></w>
+<w><t>a.&#xA0;m.</t><abbrev referenced-word="ante meridiem"/><comment>Latin "before noon"</comment></w>
 <w><t>a-a</t></w>
 <w><t>Aa-chen</t></w>
 <w><t>Aal-borg</t></w>
@@ -2251,7 +2255,7 @@
 <w><t>ad-ver-tis-a-ble</t></w>
 <w><t>ad-ver-tise</t></w>
 <w><t>ad-ver-tised</t></w>
-<w><t>ad-ver-tise-ment</t></w>
+<w><t>ad-ver-tise-ment</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
 <w><t>ad-ver-tis-er</t></w>
 <w><t>ad-ver-tis-ing</t></w>
 <w><t>ad-ver-tiz-a-ble</t></w>
@@ -13474,7 +13478,7 @@
 <phrase><t>bar mitz-vah</t></phrase>
 <w><t>barm-y</t></w>
 <w><t>bar-my</t></w>
-<w><t>barn</t></w>
+<w><t>barn</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
 <w><t>Bar-na-bas</t></w>
 <w><t>Bar-na-by</t></w>
 <w><t>bar-na-cle</t></w>
@@ -70938,7 +70942,7 @@
 <w><t>hoo-li-gan</t></w>
 <w><t>hoo-li-gan-ism</t></w>
 <w><t>hoo-ly</t></w>
-<w><t>hoop</t></w>
+<w><t>hoop</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
 <w><t>hoop-er</t></w>
 <w><t>Hoop-e-rat-ing</t></w>
 <w><t>Hoopes-ton</t></w>
@@ -81099,6 +81103,7 @@
 <w><t>Jeb-u-sit-ic</t></w>
 <w><t>Jed-burgh</t></w>
 <w><t>Jed-da</t></w>
+<w><t>Je-de-di-ah</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
 <w><t>jee</t></w>
 <w><t>jee-ing</t></w>
 <w><t>jeep</t></w>
@@ -98316,7 +98321,7 @@
 <w><t>Mo-ti-lal</t></w>
 <w><t>mo-tile</t></w>
 <w><t>mo-til-i-ty</t></w>
-<w><t>mo-tion</t></w>
+<w><t>mo-tion</t><noun><pluralizable/><convertible-to-possessive/></noun></w>
 <w><t>mo-tion-al</t></w>
 <w><t>mo-tion-er</t></w>
 <w><t>mo-tion-less</t></w>
@@ -114039,6 +114044,8 @@
 <w><t>p/c</t></w>
 <w><t>P/C</t></w>
 <w><t>P/N</t></w>
+<w><t>p.m.</t><abbrev referenced-word="post meridiem"/><comment>Latin "after noon"</comment></w>
+<w><t>p.&#xA0;m.</t><abbrev referenced-word="post meridiem"/><comment>Latin "after noon"</comment></w>
 <w><t>paal</t></w>
 <w><t>Paa-si-ki-vi</t></w>
 <w><t>PABA</t></w>
@@ -152332,6 +152339,7 @@
 <w><t>SRO</t></w>
 <w><t>Srta</t></w>
 <w><t>sru-ti</t></w>
+<w><t>ss</t><abbrev referenced-word="scilicet"/><comment>Latin "namely" or "to wit"</comment></w>
 <w><t>SSB</t></w>
 <w><t>SSC</t></w>
 <w><t>SSE</t></w>

Modified: trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml
===================================================================
--- trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml	2023-08-31 11:23:52 UTC (rev 13202)
+++ trunk/foray/foray-orthography/src/main/data/orthographies/foray-orthography-config.xml	2023-08-31 13:24:58 UTC (rev 13203)
@@ -7,12 +7,17 @@
 <axsl-orthography-config>
 
   <explicit-token-list id="eng-Latn-explicit-tokens">
+    <explicit-token end-of-sentence="never">a\.d\.</explicit-token>
+    <explicit-token end-of-sentence="never">a\.&#xA0;d\.</explicit-token>
+    <explicit-token end-of-sentence="never">a\.m\.</explicit-token>
+    <explicit-token end-of-sentence="never">a\.&#xA0;m\.</explicit-token>
     <explicit-token end-of-sentence="never">i\.e\.</explicit-token>
     <explicit-token end-of-sentence="never">i\.&#xA0;e\.</explicit-token>
-    <explicit-token end-of-sentence="never">&amp;c\.</explicit-token>
     <explicit-token end-of-sentence="never">l\.&#xA0;s\.</explicit-token>
     <explicit-token end-of-sentence="never">e\.g\.</explicit-token>
     <explicit-token end-of-sentence="never">e\.&#xA0;g\.</explicit-token>
+    <explicit-token end-of-sentence="never">p\.m\.</explicit-token>
+    <explicit-token end-of-sentence="never">p\.&#xA0;m\.</explicit-token>
   </explicit-token-list>
 
   <match-rule-list id="eng-Latn-match-rules">

Modified: trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java
===================================================================
--- trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2023-08-31 11:23:52 UTC (rev 13202)
+++ trunk/foray/foray-orthography/src/main/java/org/foray/orthography/Lexer4a.java	2023-08-31 13:24:58 UTC (rev 13203)
@@ -51,8 +51,28 @@
  * This arises from the ambiguity of the FULL_STOP or period "." character being used both as a signal for full stop and
  * as a signal for an abbreviation.</p>
  *
- * <p>The general process used by implementation of this class is as follows:</p>
+ * <p>Solutions considered were:</p>
+ * <ul>
+ *   <li>ICU4J handles the problem at the sentence level, but not at the word level.
+ *     (See test org.foray.orthography.LexerEnglishIcu4jTests#testEmbeddedAbbreviation1() where this is demonstrated).
+ *     One possible solution is to use ICU4J to first break text into sentences, then break each sentence into words.
+ *     ICU4J appears to use the Unicode Common Locale Data Repository (CLDR) to manage the data related to this issue.
+ *     CLDR maintains the abbreviation data in XML files found in common/segments in elements found at XPath
+ *     ldml/segmentations/segmentation/suppressions/suppression.
+ *     It is doubtful that relying on this data will be flexible enough to handle abbreviations that will be needed for
+ *     the tasks this lexer needs to support, especially spell-checking.
+ *     Modifying the segments data file would mean managing additions we make as diffs to that data, and would introduce
+ *     the need to build ICU4J as part of our build process.</li>
+ *   <li>Modify our word-breaking algorithm to identify the ambiguous characters, read backward to the beginning of the
+ *     word, and look up our own abbreviations database for a match.</li>
+ *   <li>Do a special search for explicit tokens that are identified for the orthography.</li>
+ * </ul>
+ * <p>The current solution employed is the last.</p>
+ *
+ *
+ * <p>The general process used in this class is as follows:</p>
  * <ol>
+ *   <li>Do a special search for any explicit tokens (such as "i.e." mentioned above).</li>
  *   <li>Find raw break boundaries. This task can be delegated to {@link java.text.BreakIterator} or ICU4J's similar
  *   class {@link com.ibm.icu.text.BreakIterator}, which use the Unicode text segmentation algorithms to find the
  *   boundaries that they report.</li>
@@ -61,21 +81,6 @@
  *   <li>Tokenize the text based on the refined boundary types.</li>
  * </ol>
  *
- * <p>TODO: We need to handle the abbreviation problem mentioned above. ICU4J handles the problem at the sentence level,
- * but not at the word level.
- * (See test org.foray.orthography.LexerEnglishIcu4jTests#testEmbeddedAbbreviation1() where this is demonstrated).
- * One possible solution is to use ICU4J to first break text into sentences, then break each sentence into words.
- * ICU4J appears to use the Unicode Common Locale Data Repository (CLDR) to manage the data related to this issue.
- * CLDR maintains the abbreviation data in XML files found in common/segments in elements found at XPath
- * ldml/segmentations/segmentation/suppressions/suppression.
- * It is doubtful that relying on this data will be flexible enough to handle abbreviations that will be needed for the
- * tasks this lexer needs to support, especially spell-checking.
- * Modifying the segments data file would mean managing additions we make as diffs to that data, and would introduce
- * the need to build ICU4J as part of our build process.
- * It may make sense to modify our word-breaking algorithm instead, to identify the ambiguous characters, read backward
- * to the beginning of the word, and lookup our own abbreviations database for a match.
- * Whatever solution is chosen, we need to remove the klunky workaround entry for "i.e" in
- * src/main/data/dictionaries/eng-Latn-ZZZ.dict.xml.</p>
  *
  * @see <a href="https://www.unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard Annex #29, Unicode Text
  * Segmentation</a>

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





2006	Jan	Feb	Mar (139)	Apr (98)	May (250)	Jun (394)	Jul (84)	Aug (13)	Sep (420)	Oct (186)	Nov (1)	Dec (3)
2007	Jan (108)	Feb (202)	Mar (291)	Apr (247)	May (374)	Jun (227)	Jul (231)	Aug (60)	Sep (31)	Oct (45)	Nov (18)	Dec
2008	Jan (38)	Feb (71)	Mar (142)	Apr	May (59)	Jun (6)	Jul (10)	Aug	Sep	Oct	Nov	Dec
2009	Jan (12)	Feb (4)	Mar (88)	Apr (121)	May (17)	Jun (30)	Jul	Aug (5)	Sep	Oct (1)	Nov	Dec
2010	Jan (11)	Feb (76)	Mar (11)	Apr	May (11)	Jun	Jul	Aug (44)	Sep (14)	Oct (7)	Nov	Dec
2011	Jan	Feb	Mar	Apr	May (9)	Jun	Jul	Aug	Sep	Oct (10)	Nov	Dec
2012	Jan	Feb	Mar	Apr	May	Jun (3)	Jul (4)	Aug	Sep	Oct	Nov	Dec
2016	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec (168)
2017	Jan (77)	Feb (11)	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2018	Jan	Feb	Mar (1)	Apr (6)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2019	Jan	Feb (88)	Mar (118)	Apr (1)	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec
2020	Jan	Feb	Mar	Apr	May (6)	Jun	Jul	Aug	Sep	Oct	Nov	Dec (141)
2021	Jan (170)	Feb (20)	Mar	Apr	May	Jun	Jul (1)	Aug	Sep	Oct (62)	Nov (189)	Dec (162)
2022	Jan (201)	Feb (118)	Mar (8)	Apr	May (2)	Jun (47)	Jul (19)	Aug (14)	Sep (3)	Oct	Nov (28)	Dec (235)
2023	Jan (112)	Feb (23)	Mar (2)	Apr (2)	May	Jun (1)	Jul	Aug (70)	Sep (92)	Oct (20)	Nov (1)	Dec (1)
2024	Jan	Feb	Mar (1)	Apr (1)	May (14)	Jun (11)	Jul (1)	Aug	Sep	Oct	Nov	Dec
2025	Jan (10)	Feb (29)	Mar	Apr (162)	May (245)	Jun (83)	Jul	Aug (1)	Sep	Oct	Nov	Dec

S	M	T	W	T	F	S
		1	2	3	4 (1)	5 (3)
6	7 (1)	8 (15)	9 (1)	10	11	12 (4)
13	14 (2)	15	16	17	18	19
20	21	22 (3)	23 (4)	24 (5)	25 (6)	26 (3)
27 (3)	28 (4)	29 (5)	30 (6)	31 (4)

foray-commit Mailing List for FOray

Modular XSL-FO Implementation for Java.

foray-commit — FOray repository commit log messages