From b15961f358452ae3f5ad8eb6120b1cabf93cb754 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Fri, 18 Jul 2025 20:15:31 +0200 Subject: [PATCH 01/11] Create new mention matcher & parser --- CHANGELOG.md | 2 + .../net/folivo/trixnity/core/MatrixRegex.kt | 314 +-------- .../net/folivo/trixnity/core/model/Mention.kt | 28 +- .../folivo/trixnity/core/util/MatrixLinks.kt | 72 ++ .../net/folivo/trixnity/core/util/Patterns.kt | 341 +++++++++ .../folivo/trixnity/core/MatrixLinkTest.kt | 304 ++++++++ .../folivo/trixnity/core/MatrixRegexTest.kt | 661 ++++-------------- 7 files changed, 881 insertions(+), 841 deletions(-) create mode 100644 trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt create mode 100644 trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt create mode 100644 trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt diff --git a/CHANGELOG.md b/CHANGELOG.md index a06f06061..f3e19d93e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- New mention matcher & parser + ### Deprecated ### Removed diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 3234cbd97..5a0baddc5 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -1,297 +1,51 @@ package net.folivo.trixnity.core -import io.ktor.http.* -import net.folivo.trixnity.core.model.* +import net.folivo.trixnity.core.model.Mention +import net.folivo.trixnity.core.model.RoomAliasId +import net.folivo.trixnity.core.model.UserId +import net.folivo.trixnity.core.util.MatrixLinks +import net.folivo.trixnity.core.util.Patterns object MatrixRegex { - // Decode/Encode Grammar - private fun 
makeSymboleRegex(symbole: String, code: String) = "(?:(?:$symbole)|(?:$code))" - private val exlMark = makeSymboleRegex("!", "%21") - private val dollar = makeSymboleRegex("\\$", "%24") - private val at = makeSymboleRegex("@", "%40") - private val hash = makeSymboleRegex("#", "%23") - private val colon = makeSymboleRegex(":", "%3A") - private val qesMark = makeSymboleRegex("\\?", "%3F") - private val eq = makeSymboleRegex("=", "%3D") - private val amp = makeSymboleRegex("&", "%26") - - // https://spec.matrix.org/v1.11/appendices/#common-namespaced-identifier-grammar - private val namespaceIdRegex = """(?!m\.)[a-z][a-z0-9-_.]{1,254}""" - - // https://spec.matrix.org/v1.11/appendices/#user-identifiers - private val userLocalpartRegex = """(?:[0-9a-z-=_/+.]+)""" - - // https://github.com/matrix-org/matrix-spec-proposals/blob/human-id-rules/drafts/human-id-rules.rst - private val roomAliasLocalpartRegex = """(?:[^:\s]+)""" - - // https://spec.matrix.org/v1.11/appendices/#opaque-identifiers - private val opaqueIdRegex = """(?:[0-9A-Za-z-._~]+)""" - - // https://spec.matrix.org/v1.11/appendices/#server-name - private const val basePortRegex = """:[0-9]{1,5}""" - private const val baseDnsRegex = """(?:[\w-]+\.)+[\w-]+""" - private const val baseIPV4Regex = """\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}""" - private const val baseIPV6Regex = """\[[0-9a-fA-F:]+\]""" - private const val servernameRegex = - """(?:(?:$baseIPV4Regex)|(?:$baseDnsRegex)|(?:$baseIPV6Regex))(?:$basePortRegex)?""" - - - // https://spec.matrix.org/v1.11/appendices/#user-identifiers - private val userIdRegex = """@($userLocalpartRegex):($servernameRegex)""" - - // https://spec.matrix.org/v1.11/appendices/#room-ids - private val roomIdRegex = """!($opaqueIdRegex):($servernameRegex)""" // TODO is opaque in future room versions - - // https://spec.matrix.org/v1.11/appendices/#room-aliases - private val roomAliasRegex = """#($roomAliasLocalpartRegex):($servernameRegex)""" - - // 
https://spec.matrix.org/v1.11/appendices/#event-ids - private val eventIdRegex = """\$($opaqueIdRegex(?::$servernameRegex)?)""" - - // https://spec.matrix.org/v1.11/appendices/#matrix-uri-scheme - private val queryParameterRegex = - """($qesMark$namespaceIdRegex$eq([^\s&]+)(?:$amp$namespaceIdRegex$eq([^\s&]+))*)""" - private val userUriRegex = - """matrix:u\/($userLocalpartRegex):($servernameRegex)$queryParameterRegex?""" - private val roomIdUriRegex = - """matrix:roomid\/($opaqueIdRegex):($servernameRegex)$queryParameterRegex?""" - private val roomAliasUriRegex = - """matrix:r\/($roomAliasLocalpartRegex):($servernameRegex)$queryParameterRegex?""" - private val eventUriRegex = - """matrix:(roomid\/$opaqueIdRegex:$servernameRegex)\/e\/($opaqueIdRegex)$queryParameterRegex?""" - - // https://spec.matrix.org/v1.11/appendices/#matrixto-navigation - private val viaArgumentRegex = """(?:\?(via=$servernameRegex))""" - private val matrixToRegex = """https?:\/\/matrix\.to\/$hash\/""" - private val userPermalinkRegex = - """$matrixToRegex$at($userLocalpartRegex)$colon($servernameRegex)$viaArgumentRegex?""" - - // see implementation note - private val roomIdPermalinkRegex = - """$matrixToRegex$exlMark($opaqueIdRegex)$colon($servernameRegex)$viaArgumentRegex?""" - private val roomAliasPermalinkRegex = - """$matrixToRegex$hash($roomAliasLocalpartRegex)$colon($servernameRegex)$viaArgumentRegex?""" - private val eventPermalinkRegex = - """$matrixToRegex($exlMark$opaqueIdRegex$colon$servernameRegex)\/$dollar($opaqueIdRegex(?:$colon$servernameRegex)?)$viaArgumentRegex?""" - - private fun getAnchor(regex: String, maxLength: Int): String = - "(.*?)<\\/a>" - - val domain by lazy { servernameRegex.toRegex() } - val userIdLocalpart by lazy { userLocalpartRegex.toRegex(255) } - val roomAliasLocalpart by lazy { roomAliasLocalpartRegex.toRegex(255) } - val opaqueId by lazy { opaqueIdRegex.toRegex(255) } - val namespacedId by lazy { namespaceIdRegex.toRegex() } - - val userId by lazy { 
userIdRegex.toRegex(255) } - val roomId by lazy { roomIdRegex.toRegex(255) } - val roomAlias by lazy { roomAliasRegex.toRegex(255) } - val eventId by lazy { eventIdRegex.toRegex(255) } - - val userIdUri by lazy { userUriRegex.toRegex(255) } - val roomIdUri by lazy { roomIdUriRegex.toRegex(255) } - val roomAliasUri by lazy { roomAliasUriRegex.toRegex(255) } - val eventIdUri by lazy { eventUriRegex.toRegex(255) } - - private val userIdPermalink by lazy { userPermalinkRegex.toRegex(255) } - private val roomIdPermalink by lazy { roomIdPermalinkRegex.toRegex(255) } - private val roomAliasPermalink by lazy { roomAliasPermalinkRegex.toRegex(255) } - private val eventIdPermalink by lazy { eventPermalinkRegex.toRegex(255) } - - internal val userIdPermalinkAnchor by lazy { getAnchor(userPermalinkRegex, 255).toRegex() } - internal val roomIdPermalinkAnchor by lazy { getAnchor(roomIdPermalinkRegex, 255).toRegex() } - internal val roomAliasPermalinkAnchor by lazy { getAnchor(roomAliasPermalinkRegex, 255).toRegex() } - internal val eventIdPermalinkAnchor by lazy { getAnchor(eventPermalinkRegex, 255).toRegex() } - - private val userIdUriAnchor by lazy { getAnchor(userUriRegex, 255).toRegex() } - private val roomIdUriAnchor by lazy { getAnchor(roomIdUriRegex, 255).toRegex() } - private val roomAliasUriAnchor by lazy { getAnchor(roomAliasUriRegex, 255).toRegex() } - private val eventIdUriAnchor by lazy { getAnchor(eventUriRegex, 255).toRegex() } + // language=Regexp + private const val ID_PATTERN = """[@#][0-9a-z\-.=_/+]+:(?:[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|\[[0-9a-fA-F:.]{2,45}]|[0-9a-zA-Z\-.]{1,255})(?::[0-9]{1,5})?""" + private val idRegex = ID_PATTERN.toRegex() fun findMentions(message: String): Map { - val mentions = findUserIdMentions(message) - .plus(findRoomIdMentions(message)) - .plus(findRoomAliasMentions(message)) - .plus(findEventMentions(message)) - - val uniqueMentions = mentions.filter { mention -> - mentions.forEach { - if 
(it.key.contains(mention.key)) { - return@filter false + val links = findLinkMentions(message) + val users = findIdMentions(message) + val linksRange = links.keys.sortedBy { it.first } + val uniqueUsers = users.filter { (user, _) -> + val index = linksRange.binarySearch { link -> + when { + link.first > user.first -> 1 /* link starts after the ID does, so a covering link can only sort earlier: positive = search left */ + link.last < user.last -> -1 /* link ends before the ID does, so a covering link can only sort later: negative = search right */ + else -> 0 /* link spans the whole ID: the plain ID duplicates the link and is dropped below */ } } - - true - } - - return uniqueMentions - } - - private fun findUserIdMentions(message: String): Map { - fun handleMention(result: List, options: List): Mention.User { - val match = result[0] - val localpart = result[1] - val domain = result[2] - val (params, label) = parseOptions(options, anchor = match.startsWith("")) - - return Mention.User(UserId(localpart, domain), match, params, label) - } - - val ids = findMention(userId, message, ::handleMention) - val uris = findMention(userIdUri, message, ::handleMention) - val uriAnchors = findMention(userIdUriAnchor, message, ::handleMention) - val links = findMention(userIdPermalink, message, ::handleMention) - val linkAnchors = findMention(userIdPermalinkAnchor, message, ::handleMention) - - return ids + uris + uriAnchors + links + linkAnchors - } - - private fun findRoomIdMentions(message: String): Map { - fun handleMention( - result: List, - options: List - ): Mention.Room { - val match = result[0] - val localpart = result[1] - val domain = result[2] - val (params, label) = parseOptions(options, anchor = match.startsWith("")) - - // TODO is opaque String in future room versions - return Mention.Room(RoomId(localpart, domain), match, params, label) - } - - val ids = findMention(roomId, message, ::handleMention) - val uris = findMention(roomIdUri, message, ::handleMention) - val uriAnchors = findMention(roomIdUriAnchor, message, ::handleMention) - val links = findMention(roomIdPermalink, message, ::handleMention) - val linkAnchors = findMention(roomIdPermalinkAnchor, message, ::handleMention) - - return ids + uris + uriAnchors + links + linkAnchors - } - -
private fun findRoomAliasMentions(message: String): Map { - fun handleMention(result: List, options: List): Mention.RoomAlias { - val match = result[0] - val localpart = result[1] - val domain = result[2] - val (params, label) = parseOptions(options, anchor = match.startsWith("")) - - return Mention.RoomAlias(RoomAliasId(localpart, domain), match, params, label) + index < 0 } - - val aliases = findMention(roomAlias, message, ::handleMention) - val uris = findMention(roomAliasUri, message, ::handleMention) - val uriAnchors = findMention(roomAliasUriAnchor, message, ::handleMention) - val links = findMention(roomAliasPermalink, message, ::handleMention) - val linkAnchors = findMention(roomAliasPermalinkAnchor, message, ::handleMention) - - return aliases + uris + uriAnchors + links + linkAnchors + return links.plus(uniqueUsers).toMap() } - private fun findEventMentions(message: String): Map { - val ids = eventId.findAll(message).associate { - val result = it.groupValues.filter(String::isNotBlank) - - val match = result[0] - val eventId = result[1] - - it.range to Mention.Event(eventId = EventId("$$eventId"), match = match) - } - - val uris = eventIdUri.findAll(message).associate { - val (result, options) = it.groupValues.filter(String::isNotBlank).let { - it.take(3) to it.drop(3) - } - - val match = result[0] - val roomId = result[1].replaceFirst("roomid/", "!") - val eventId = result[2] - val (params, _) = parseOptions(options, false) - - it.range to Mention.Event(RoomId(roomId), EventId("$$eventId"), match, parameters = params) - } - - val uriAnchors = eventIdUriAnchor.findAll(message).associate { - val (result, options) = it.groupValues.filter(String::isNotBlank).let { - it.take(3) to it.drop(3) - } - - val match = result[0] - val roomId = result[1].replaceFirst("roomid/", "!") - val eventId = result[2] - val (params, label) = parseOptions(options, true) - - it.range to Mention.Event( - RoomId(roomId), - EventId("$$eventId"), - match, - parameters = params, - 
label = label - ) - } - - val links = eventIdPermalink.findAll(message).associate { - val (result, options) = it.groupValues.filter(String::isNotBlank).let { - it.take(3) to it.drop(3) - } - - val match = result[0] - val roomId = result[1] - val eventId = result[2] - val (params, _) = parseOptions(options, false) - - it.range to Mention.Event(RoomId(roomId.decodeURLPart()), EventId("$$eventId"), match, parameters = params) - } - - val linkAnchors = eventIdPermalinkAnchor.findAll(message).associate { - val (result, options) = it.groupValues.filter(String::isNotBlank).let { - it.take(3) to it.drop(3) - } - - val match = result[0] - val roomId = result[1] - val eventId = result[2] - val (params, label) = parseOptions(options, true) - - it.range to Mention.Event( - RoomId(roomId.decodeURLPart()), - EventId("$$eventId"), - match, - parameters = params, - label = label - ) - } - - return ids + uris + uriAnchors + links + linkAnchors + fun findIdMentions(content: String): Map { + return idRegex.findAll(content) + .filter { it.value.length <= 255 } /* keep matches of at most 255 characters; range.last - range.first is length - 1 and let 256 through */ + .mapNotNull { Pair(it.range, parseMatrixId(it.value) ?: return@mapNotNull null) } + .toMap() } - fun parseOptions(options: List, anchor: Boolean): Pair { - val params = options.firstOrNull()?.let { - Parameters.build { - parseQueryString(it).forEach { key, values -> - this.appendAll(key.removePrefix("%3F").removePrefix("?"), values) - } - } - } - val label = if (anchor) options.last() else null - - return params to label + fun findLinkMentions(content: String): Map { + return Patterns.AUTOLINK_MATRIX_URI.findAll(content).mapNotNull { + Pair(it.range, MatrixLinks.parse(it.value) ?: return@mapNotNull null) + }.toMap() } - fun findMention( - regex: Regex, - message: String, - handle: (List, List) -> Mention - ): Map { - return regex.findAll(message).associate { match -> - match.range to match.groupValues.filter(String::isNotBlank).let { - handle(it.take(3), it.drop(3)) - } + private fun parseMatrixId(id: String):
Mention? { + return when { + id.startsWith(UserId.sigilCharacter) -> Mention.User(UserId(id)) + id.startsWith(RoomAliasId.sigilCharacter) -> Mention.RoomAlias(RoomAliasId(id)) + else -> null } } -} - -private fun String.toRegex(maxLength: Int) = "(?!.{${maxLength + 1},})$this".toRegex() -private fun IntRange.contains(other: IntRange): Boolean = - this.start <= other.start && other.endInclusive <= this.endInclusive && this != other - +} \ No newline at end of file diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/model/Mention.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/model/Mention.kt index 1466e8e8f..1ffecedfd 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/model/Mention.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/model/Mention.kt @@ -6,32 +6,18 @@ import io.ktor.http.* * Represents a mention. A mention can refer to various entities and potentially include actions associated with them. */ sealed interface Mention { - - /** - * The textual representation of the mention within the message it appears. - * Use with care, IntRange preferred - */ - val match: String - /** * If exists, the parameters provided in the URI */ val parameters: Parameters? - /** - * The optional display name associated with the mention, if applicable. - */ - val label: String? - /** * Represents a mention of a user. */ data class User( val userId: UserId, - override val match: String, - override val parameters: Parameters? = null, - override val label: String? = null + override val parameters: Parameters? = parametersOf() ) : Mention /** @@ -39,9 +25,7 @@ sealed interface Mention { */ data class Room( val roomId: RoomId, - override val match: String, - override val parameters: Parameters? = null, - override val label: String? = null + override val parameters: Parameters? 
= parametersOf() ) : Mention /** @@ -49,9 +33,7 @@ sealed interface Mention { */ data class RoomAlias( val roomAliasId: RoomAliasId, - override val match: String, - override val parameters: Parameters? = null, - override val label: String? = null + override val parameters: Parameters? = parametersOf() ) : Mention /** @@ -60,8 +42,6 @@ sealed interface Mention { data class Event( val roomId: RoomId? = null, val eventId: EventId, - override val match: String, - override val label: String? = null, - override val parameters: Parameters? = null + override val parameters: Parameters? = parametersOf() ) : Mention } \ No newline at end of file diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt new file mode 100644 index 000000000..d5aaa7a66 --- /dev/null +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt @@ -0,0 +1,72 @@ +package net.folivo.trixnity.core.util + +import io.ktor.http.* +import net.folivo.trixnity.core.model.EventId +import net.folivo.trixnity.core.model.Mention +import net.folivo.trixnity.core.model.RoomAliasId +import net.folivo.trixnity.core.model.RoomId +import net.folivo.trixnity.core.model.UserId + +object MatrixLinks { + private val matrixProtocol = URLProtocol("matrix", 0) + + fun parse(href: String): Mention? 
{ + val url = Url(href) + if (url.protocol == matrixProtocol) { /* matrix: URI - the entity is described by the path segments */ + return parseMatrixProtocol(url.segments, url.parameters) + } + if (url.protocol == URLProtocol.HTTPS && url.host == "matrix.to" && url.segments.isEmpty()) { /* matrix.to permalink - identifiers and query are both read from the fragment */ + val path = url.fragment.substringBefore('?').removePrefix("/") + val query = url.fragment.substringAfter('?', missingDelimiterValue = "") + val segments = path.removePrefix("/").split('/') + val parameters = parseQueryString(query, decode = false) /* query values are kept undecoded */ + return parseMatrixTo(segments, parameters) + } + return null /* any other protocol or host is not treated as a mention */ + } + + private fun parseMatrixTo(path: List, parameters: Parameters): Mention? { + val parts = path.map { id -> /* type each segment by its leading sigil; over-long or unrecognised segments become null */ + when { + id.length > 255 -> null + id.startsWith(RoomAliasId.sigilCharacter) -> RoomAliasId(id) + id.startsWith(RoomId.sigilCharacter) -> RoomId(id) + id.startsWith(UserId.sigilCharacter) -> UserId(id) + id.startsWith(EventId.sigilCharacter) -> EventId(id) + else -> null + } + } + val first = parts.getOrNull(0) + val second = parts.getOrNull(1) + return when { + first is UserId -> Mention.User(first, parameters) + first is RoomAliasId -> Mention.RoomAlias(first, parameters) + first is EventId -> Mention.Event(null, first, parameters) /* event referenced without a room */ + first is RoomId && second is EventId -> Mention.Event(first, second, parameters) /* checked before the plain room branch so room/event pairs are not reported as rooms */ + first is RoomId -> Mention.Room(first, parameters) + else -> null + } + } + + private fun parseMatrixProtocol(path: List, parameters: Parameters): Mention?
{ + val parts = path.windowed(2, 2).map { (type, id) -> + when { + id.length > 255 -> null + type == "roomid" -> RoomId("!$id") + type == "r" -> RoomAliasId("#$id") + type == "u" -> UserId("@$id") + type == "e" -> EventId("$$id") + else -> null + } + } + val first = parts.getOrNull(0) + val second = parts.getOrNull(1) + return when { + first is UserId -> Mention.User(first, parameters) + first is RoomAliasId -> Mention.RoomAlias(first, parameters) + first is RoomId && second is EventId -> Mention.Event(first, second, parameters) + first is RoomId -> Mention.Room(first, parameters) + else -> null + } + } +} \ No newline at end of file diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt new file mode 100644 index 000000000..14c40cdb2 --- /dev/null +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package net.folivo.trixnity.core.util + +/** + * Commonly used regular expression patterns. + */ +internal object Patterns { + /** + * Regular expression to match all IANA top-level domains. + * + * List accurate as of 2023/09/11. 
List taken from: + * http://data.iana.org/TLD/tlds-alpha-by-domain.txt + * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py + */ + // language=RegExp + private const val IANA_TOP_LEVEL_DOMAINS: String = ("(?:" + + "(?:aaa|aarp|abb|abbott|abbvie|abc|able|abogado|abudhabi|academy|accenture|accountant" + + "|accountants|aco|actor|ads|adult|aeg|aero|aetna|afl|africa|agakhan|agency|aig|airbus" + + "|airforce|airtel|akdn|alibaba|alipay|allfinanz|allstate|ally|alsace|alstom|amazon|americanexpress" + + "|americanfamily|amex|amfam|amica|amsterdam|analytics|android|anquan|anz|aol|apartments" + + "|app|apple|aquarelle|arab|aramco|archi|army|arpa|art|arte|asda|asia|associates|athleta" + + "|attorney|auction|audi|audible|audio|auspost|author|auto|autos|avianca|aws|axa|azure" + + "|a[cdefgilmoqrstuwxz])" + + "|(?:baby|baidu|banamex|bananarepublic|band|bank|bar|barcelona|barclaycard|barclays" + + "|barefoot|bargains|baseball|basketball|bauhaus|bayern|bbc|bbt|bbva|bcg|bcn|beats|beauty" + + "|beer|bentley|berlin|best|bestbuy|bet|bharti|bible|bid|bike|bing|bingo|bio|biz|black" + + "|blackfriday|blockbuster|blog|bloomberg|blue|bms|bmw|bnpparibas|boats|boehringer|bofa" + + "|bom|bond|boo|book|booking|bosch|bostik|boston|bot|boutique|box|bradesco|bridgestone" + + "|broadway|broker|brother|brussels|build|builders|business|buy|buzz|bzh|b[abdefghijmnorstvwyz])" + + "|(?:cab|cafe|cal|call|calvinklein|cam|camera|camp|canon|capetown|capital|capitalone" + + "|car|caravan|cards|care|career|careers|cars|casa|case|cash|casino|cat|catering|catholic" + + "|cba|cbn|cbre|cbs|center|ceo|cern|cfa|cfd|chanel|channel|charity|chase|chat|cheap|chintai" + + "|christmas|chrome|church|cipriani|circle|cisco|citadel|citi|citic|city|cityeats|claims" + + "|cleaning|click|clinic|clinique|clothing|cloud|club|clubmed|coach|codes|coffee|college" + + "|cologne|com|comcast|commbank|community|company|compare|computer|comsec|condos|construction" + + 
"|consulting|contact|contractors|cooking|cool|coop|corsica|country|coupon|coupons|courses" + + "|cpa|credit|creditcard|creditunion|cricket|crown|crs|cruise|cruises|cuisinella|cymru" + + "|cyou|c[acdfghiklmnoruvwxyz])" + + "|(?:dabur|dad|dance|data|date|dating|datsun|day|dclk|dds|deal|dealer|deals|degree" + + "|delivery|dell|deloitte|delta|democrat|dental|dentist|desi|design|dev|dhl|diamonds|diet" + + "|digital|direct|directory|discount|discover|dish|diy|dnp|docs|doctor|dog|domains|dot" + + "|download|drive|dtv|dubai|dunlop|dupont|durban|dvag|dvr|d[ejkmoz])" + + "|(?:earth|eat|eco|edeka|edu|education|email|emerck|energy|engineer|engineering|enterprises" + + "|epson|equipment|ericsson|erni|esq|estate|etisalat|eurovision|eus|events|exchange|expert" + + "|exposed|express|extraspace|e[cegrstu])" + + "|(?:fage|fail|fairwinds|faith|family|fan|fans|farm|farmers|fashion|fast|fedex|feedback" + + "|ferrari|ferrero|fidelity|fido|film|final|finance|financial|fire|firestone|firmdale" + + "|fish|fishing|fit|fitness|flickr|flights|flir|florist|flowers|fly|foo|food|football" + + "|ford|forex|forsale|forum|foundation|fox|free|fresenius|frl|frogans|frontdoor|frontier" + + "|ftr|fujitsu|fun|fund|furniture|futbol|fyi|f[ijkmor])" + + "|(?:gal|gallery|gallo|gallup|game|games|gap|garden|gay|gbiz|gdn|gea|gent|genting" + + "|george|ggee|gift|gifts|gives|giving|glass|gle|global|globo|gmail|gmbh|gmo|gmx|godaddy" + + "|gold|goldpoint|golf|goo|goodyear|goog|google|gop|got|gov|grainger|graphics|gratis|green" + + "|gripe|grocery|group|guardian|gucci|guge|guide|guitars|guru|g[abdefghilmnpqrstuwy])" + + "|(?:hair|hamburg|hangout|haus|hbo|hdfc|hdfcbank|health|healthcare|help|helsinki|here" + + "|hermes|hiphop|hisamitsu|hitachi|hiv|hkt|hockey|holdings|holiday|homedepot|homegoods" + + "|homes|homesense|honda|horse|hospital|host|hosting|hot|hotels|hotmail|house|how|hsbc" + + "|hughes|hyatt|hyundai|h[kmnrtu])" + + "|(?:ibm|icbc|ice|icu|ieee|ifm|ikano|imamat|imdb|immo|immobilien|inc|industries|infiniti" 
+ + "|info|ing|ink|institute|insurance|insure|int|international|intuit|investments|ipiranga" + + "|irish|ismaili|ist|istanbul|itau|itv|i[delmnoqrst])" + + "|(?:jaguar|java|jcb|jeep|jetzt|jewelry|jio|jll|jmp|jnj|jobs|joburg|jot|joy|jpmorgan" + + "|jprs|juegos|juniper|j[emop])" + + "|(?:kaufen|kddi|kerryhotels|kerrylogistics|kerryproperties|kfh|kia|kids|kim|kinder" + + "|kindle|kitchen|kiwi|koeln|komatsu|kosher|kpmg|kpn|krd|kred|kuokgroup|kyoto|k[eghimnprwyz])" + + "|(?:lacaixa|lamborghini|lamer|lancaster|land|landrover|lanxess|lasalle|lat|latino" + + "|latrobe|law|lawyer|lds|lease|leclerc|lefrak|legal|lego|lexus|lgbt|lidl|life|lifeinsurance" + + "|lifestyle|lighting|like|lilly|limited|limo|lincoln|link|lipsy|live|living|llc|llp|loan" + + "|loans|locker|locus|lol|london|lotte|lotto|love|lpl|lplfinancial|ltd|ltda|lundbeck|luxe" + + "|luxury|l[abcikrstuvy])" + + "|(?:madrid|maif|maison|makeup|man|management|mango|map|market|marketing|markets|marriott" + + "|marshalls|mattel|mba|mckinsey|med|media|meet|melbourne|meme|memorial|men|menu|merckmsd" + + "|miami|microsoft|mil|mini|mint|mit|mitsubishi|mlb|mls|mma|mobi|mobile|moda|moe|moi|mom" + + "|monash|money|monster|mormon|mortgage|moscow|moto|motorcycles|mov|movie|msd|mtn|mtr" + + "|museum|music|m[acdeghklmnopqrstuvwxyz])" + + "|(?:nab|nagoya|name|natura|navy|nba|nec|net|netbank|netflix|network|neustar|new|news" + + "|next|nextdirect|nexus|nfl|ngo|nhk|nico|nike|nikon|ninja|nissan|nissay|nokia|norton" + + "|now|nowruz|nowtv|nra|nrw|ntt|nyc|n[acefgilopruz])" + + "|(?:obi|observer|office|okinawa|olayan|olayangroup|oldnavy|ollo|omega|one|ong|onl" + + "|online|ooo|open|oracle|orange|org|organic|origins|osaka|otsuka|ott|ovh|om)" + + "|(?:page|panasonic|paris|pars|partners|parts|party|pay|pccw|pet|pfizer|pharmacy|phd" + + "|philips|phone|photo|photography|photos|physio|pics|pictet|pictures|pid|pin|ping|pink" + + "|pioneer|pizza|place|play|playstation|plumbing|plus|pnc|pohl|poker|politie|porn|post" + + 
"|pramerica|praxi|press|prime|pro|prod|productions|prof|progressive|promo|properties" + + "|property|protection|pru|prudential|pub|pwc|p[aefghklmnrstwy])" + + "|(?:qpon|quebec|quest|qa)" + + "|(?:racing|radio|read|realestate|realtor|realty|recipes|red|redstone|redumbrella" + + "|rehab|reise|reisen|reit|reliance|ren|rent|rentals|repair|report|republican|rest|restaurant" + + "|review|reviews|rexroth|rich|richardli|ricoh|ril|rio|rip|rocher|rocks|rodeo|rogers|room" + + "|rsvp|rugby|ruhr|run|rwe|ryukyu|r[eosuw])" + + "|(?:saarland|safe|safety|sakura|sale|salon|samsclub|samsung|sandvik|sandvikcoromant" + + "|sanofi|sap|sarl|sas|save|saxo|sbi|sbs|sca|scb|schaeffler|schmidt|scholarships|school" + + "|schule|schwarz|science|scot|search|seat|secure|security|seek|select|sener|services" + + "|seven|sew|sex|sexy|sfr|shangrila|sharp|shaw|shell|shia|shiksha|shoes|shop|shopping" + + "|shouji|show|showtime|silk|sina|singles|site|ski|skin|sky|skype|sling|smart|smile|sncf" + + "|soccer|social|softbank|software|sohu|solar|solutions|song|sony|soy|spa|space|sport" + + "|spot|srl|stada|staples|star|statebank|statefarm|stc|stcgroup|stockholm|storage|store" + + "|stream|studio|study|style|sucks|supplies|supply|support|surf|surgery|suzuki|swatch" + + "|swiss|sydney|systems|s[abcdeghijklmnorstuvxyz])" + + "|(?:tab|taipei|talk|taobao|target|tatamotors|tatar|tattoo|tax|taxi|tci|tdk|team|tech" + + "|technology|tel|temasek|tennis|teva|thd|theater|theatre|tiaa|tickets|tienda|tips|tires" + + "|tirol|tjmaxx|tjx|tkmaxx|tmall|today|tokyo|tools|top|toray|toshiba|total|tours|town" + + "|toyota|toys|trade|trading|training|travel|travelers|travelersinsurance|trust|trv|tube" + + "|tui|tunes|tushu|tvs|t[cdfghjklmnortvwz])" + + "|(?:ubank|ubs|unicom|university|uno|uol|ups|u[agksyz])" + + "|(?:vacations|vana|vanguard|vegas|ventures|verisign|versicherung|vet|viajes|video" + + "|vig|viking|villas|vin|vip|virgin|visa|vision|viva|vivo|vlaanderen|vodka|volkswagen" + + "|volvo|vote|voting|voto|voyage|v[aceginu])" 
+ + "|(?:wales|walmart|walter|wang|wanggou|watch|watches|weather|weatherchannel|webcam" + + "|weber|website|wed|wedding|weibo|weir|whoswho|wien|wiki|williamhill|win|windows|wine" + + "|winners|wme|wolterskluwer|woodside|work|works|world|wow|wtc|wtf|w[fs])" + + "|(?:\u03b5\u03bb|\u03b5\u03c5|\u0431\u0433|\u0431\u0435\u043b|\u0434\u0435\u0442\u0438" + + "|\u0435\u044e|\u043a\u0430\u0442\u043e\u043b\u0438\u043a|\u043a\u043e\u043c|\u043c\u043a\u0434" + + "|\u043c\u043e\u043d|\u043c\u043e\u0441\u043a\u0432\u0430|\u043e\u043d\u043b\u0430\u0439\u043d" + + "|\u043e\u0440\u0433|\u0440\u0443\u0441|\u0440\u0444|\u0441\u0430\u0439\u0442|\u0441\u0440\u0431" + + "|\u0443\u043a\u0440|\u049b\u0430\u0437|\u0570\u0561\u0575|\u05d9\u05e9\u05e8\u05d0\u05dc" + + "|\u05e7\u05d5\u05dd|\u0627\u0628\u0648\u0638\u0628\u064a|\u0627\u062a\u0635\u0627\u0644\u0627\u062a" + + "|\u0627\u0631\u0627\u0645\u0643\u0648|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u0628\u062d\u0631\u064a\u0646" + + "|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629" + + "|\u0627\u0644\u0639\u0644\u064a\u0627\u0646|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a" + + "|\u0627\u06cc\u0631\u0627\u0646|\u0628\u0627\u0631\u062a|\u0628\u0627\u0632\u0627\u0631" + + "|\u0628\u064a\u062a\u0643|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633|\u0633\u0648\u062f\u0627\u0646" + + "|\u0633\u0648\u0631\u064a\u0629|\u0634\u0628\u0643\u0629|\u0639\u0631\u0627\u0642|\u0639\u0631\u0628" + + "|\u0639\u0645\u0627\u0646|\u0641\u0644\u0633\u0637\u064a\u0646|\u0642\u0637\u0631|\u0643\u0627\u062b\u0648\u0644\u064a\u0643" + + "|\u0643\u0648\u0645|\u0645\u0635\u0631|\u0645\u0644\u064a\u0633\u064a\u0627|\u0645\u0648\u0631\u064a\u062a\u0627\u0646\u064a\u0627" + + "|\u0645\u0648\u0642\u0639|\u0647\u0645\u0631\u0627\u0647|\u067e\u0627\u06a9\u0633\u062a\u0627\u0646" + + "|\u0680\u0627\u0631\u062a|\u0915\u0949\u092e|\u0928\u0947\u091f|\u092d\u093e\u0930\u0924" + + 
"|\u092d\u093e\u0930\u0924\u092e\u094d|\u092d\u093e\u0930\u094b\u0924|\u0938\u0902\u0917\u0920\u0928" + + "|\u09ac\u09be\u0982\u09b2\u09be|\u09ad\u09be\u09b0\u09a4|\u09ad\u09be\u09f0\u09a4|\u0a2d\u0a3e\u0a30\u0a24" + + "|\u0aad\u0abe\u0ab0\u0aa4|\u0b2d\u0b3e\u0b30\u0b24|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe" + + "|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd" + + "|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0cad\u0cbe\u0cb0\u0ca4|\u0d2d\u0d3e\u0d30\u0d24\u0d02" + + "|\u0dbd\u0d82\u0d9a\u0dcf|\u0e04\u0e2d\u0e21|\u0e44\u0e17\u0e22|\u0ea5\u0eb2\u0ea7|\u10d2\u10d4" + + "|\u307f\u3093\u306a|\u30a2\u30de\u30be\u30f3|\u30af\u30e9\u30a6\u30c9|\u30b0\u30fc\u30b0\u30eb" + + "|\u30b3\u30e0|\u30b9\u30c8\u30a2|\u30bb\u30fc\u30eb|\u30d5\u30a1\u30c3\u30b7\u30e7\u30f3" + + "|\u30dd\u30a4\u30f3\u30c8|\u4e16\u754c|\u4e2d\u4fe1|\u4e2d\u56fd|\u4e2d\u570b|\u4e2d\u6587\u7f51" + + "|\u4e9a\u9a6c\u900a|\u4f01\u4e1a|\u4f5b\u5c71|\u4fe1\u606f|\u5065\u5eb7|\u516b\u5366" + + "|\u516c\u53f8|\u516c\u76ca|\u53f0\u6e7e|\u53f0\u7063|\u5546\u57ce|\u5546\u5e97|\u5546\u6807" + + "|\u5609\u91cc|\u5609\u91cc\u5927\u9152\u5e97|\u5728\u7ebf|\u5927\u62ff|\u5929\u4e3b\u6559" + + "|\u5a31\u4e50|\u5bb6\u96fb|\u5e7f\u4e1c|\u5fae\u535a|\u6148\u5584|\u6211\u7231\u4f60" + + "|\u624b\u673a|\u62db\u8058|\u653f\u52a1|\u653f\u5e9c|\u65b0\u52a0\u5761|\u65b0\u95fb" + + "|\u65f6\u5c1a|\u66f8\u7c4d|\u673a\u6784|\u6de1\u9a6c\u9521|\u6e38\u620f|\u6fb3\u9580" + + "|\u70b9\u770b|\u79fb\u52a8|\u7ec4\u7ec7\u673a\u6784|\u7f51\u5740|\u7f51\u5e97|\u7f51\u7ad9" + + "|\u7f51\u7edc|\u8054\u901a|\u8c37\u6b4c|\u8d2d\u7269|\u901a\u8ca9|\u96c6\u56e2|\u96fb\u8a0a\u76c8\u79d1" + + "|\u98de\u5229\u6d66|\u98df\u54c1|\u9910\u5385|\u9999\u683c\u91cc\u62c9|\u9999\u6e2f" + + "|\ub2f7\ub137|\ub2f7\ucef4|\uc0bc\uc131|\ud55c\uad6d" + + "|xbox|xerox|xfinity|xihuan|xin|xn\\-\\-11b4c3d|xn\\-\\-1ck2e1b|xn\\-\\-1qqw23a|xn\\-\\-2scrj9c" + + 
"|xn\\-\\-30rr7y|xn\\-\\-3bst00m|xn\\-\\-3ds443g|xn\\-\\-3e0b707e|xn\\-\\-3hcrj9c|xn\\-\\-3pxu8k" + + "|xn\\-\\-42c2d9a|xn\\-\\-45br5cyl|xn\\-\\-45brj9c|xn\\-\\-45q11c|xn\\-\\-4dbrk0ce|xn\\-\\-4gbrim" + + "|xn\\-\\-54b7fta0cc|xn\\-\\-55qw42g|xn\\-\\-55qx5d|xn\\-\\-5su34j936bgsg|xn\\-\\-5tzm5g" + + "|xn\\-\\-6frz82g|xn\\-\\-6qq986b3xl|xn\\-\\-80adxhks|xn\\-\\-80ao21a|xn\\-\\-80aqecdr1a" + + "|xn\\-\\-80asehdb|xn\\-\\-80aswg|xn\\-\\-8y0a063a|xn\\-\\-90a3ac|xn\\-\\-90ae|xn\\-\\-90ais" + + "|xn\\-\\-9dbq2a|xn\\-\\-9et52u|xn\\-\\-9krt00a|xn\\-\\-b4w605ferd|xn\\-\\-bck1b9a5dre4c" + + "|xn\\-\\-c1avg|xn\\-\\-c2br7g|xn\\-\\-cck2b3b|xn\\-\\-cckwcxetd|xn\\-\\-cg4bki|xn\\-\\-clchc0ea0b2g2a9gcd" + + "|xn\\-\\-czr694b|xn\\-\\-czrs0t|xn\\-\\-czru2d|xn\\-\\-d1acj3b|xn\\-\\-d1alf|xn\\-\\-e1a4c" + + "|xn\\-\\-eckvdtc9d|xn\\-\\-efvy88h|xn\\-\\-fct429k|xn\\-\\-fhbei|xn\\-\\-fiq228c5hs" + + "|xn\\-\\-fiq64b|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fjq720a|xn\\-\\-flw351e|xn\\-\\-fpcrj9c3d" + + "|xn\\-\\-fzc2c9e2c|xn\\-\\-fzys8d69uvgm|xn\\-\\-g2xx48c|xn\\-\\-gckr3f0f|xn\\-\\-gecrj9c" + + "|xn\\-\\-gk3at1e|xn\\-\\-h2breg3eve|xn\\-\\-h2brj9c|xn\\-\\-h2brj9c8c|xn\\-\\-hxt814e" + + "|xn\\-\\-i1b6b1a6a2e|xn\\-\\-imr513n|xn\\-\\-io0a7i|xn\\-\\-j1aef|xn\\-\\-j1amh|xn\\-\\-j6w193g" + + "|xn\\-\\-jlq480n2rg|xn\\-\\-jvr189m|xn\\-\\-kcrx77d1x4a|xn\\-\\-kprw13d|xn\\-\\-kpry57d" + + "|xn\\-\\-kput3i|xn\\-\\-l1acc|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgb9awbf|xn\\-\\-mgba3a3ejt" + + "|xn\\-\\-mgba3a4f16a|xn\\-\\-mgba7c0bbn0a|xn\\-\\-mgbaakc7dvf|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbab2bd" + + "|xn\\-\\-mgbah1a3hjkrd|xn\\-\\-mgbai9azgqp6j|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a|xn\\-\\-mgbbh1a71e" + + "|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgbca7dzdo|xn\\-\\-mgbcpq6gpa1a|xn\\-\\-mgberp4a5d4ar|xn\\-\\-mgbgu82a" + + "|xn\\-\\-mgbi4ecexp|xn\\-\\-mgbpl2fh|xn\\-\\-mgbt3dhd|xn\\-\\-mgbtx2b|xn\\-\\-mgbx4cd0ab" + + "|xn\\-\\-mix891f|xn\\-\\-mk1bu44c|xn\\-\\-mxtq1m|xn\\-\\-ngbc5azd|xn\\-\\-ngbe9e0a|xn\\-\\-ngbrx" + + 
"|xn\\-\\-node|xn\\-\\-nqv7f|xn\\-\\-nqv7fs00ema|xn\\-\\-nyqy26a|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl" + + "|xn\\-\\-otu796d|xn\\-\\-p1acf|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-pssy2u|xn\\-\\-q7ce6a" + + "|xn\\-\\-q9jyb4c|xn\\-\\-qcka1pmc|xn\\-\\-qxa6a|xn\\-\\-qxam|xn\\-\\-rhqv96g|xn\\-\\-rovu88b" + + "|xn\\-\\-rvc1e0am3e|xn\\-\\-s9brj9c|xn\\-\\-ses554g|xn\\-\\-t60b56a|xn\\-\\-tckwe|xn\\-\\-tiq49xqyj" + + "|xn\\-\\-unup4y|xn\\-\\-vermgensberater\\-ctb|xn\\-\\-vermgensberatung\\-pwb|xn\\-\\-vhquv" + + "|xn\\-\\-vuq861b|xn\\-\\-w4r85el8fhu5dnra|xn\\-\\-w4rs40l|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a" + + "|xn\\-\\-xhq521b|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-y9a3aq|xn\\-\\-yfro4i67o" + + "|xn\\-\\-ygbi2ammx|xn\\-\\-zfr164b|xxx|xyz)" + + "|(?:yachts|yahoo|yamaxun|yandex|yodobashi|yoga|yokohama|you|youtube|yun|y[et])" + + "|(?:zappos|zara|zero|zip|zone|zuerich|z[amw]))") + + // language=RegExp + private const val IP_ADDRESS = ("((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]" + + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]" + + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}" + + "|[1-9][0-9]|[0-9]))") + + /** + * Valid UCS characters defined in RFC 3987. Excludes space characters. + */ + // language=RegExp + private const val UCS_CHAR = "[" + + "\u00A0-\uD7FF" + + "\uF900-\uFDCF" + + "\uFDF0-\uFFEF" + + "\uD800\uDC00-\uD83F\uDFFD" + + "\uD840\uDC00-\uD87F\uDFFD" + + "\uD880\uDC00-\uD8BF\uDFFD" + + "\uD8C0\uDC00-\uD8FF\uDFFD" + + "\uD900\uDC00-\uD93F\uDFFD" + + "\uD940\uDC00-\uD97F\uDFFD" + + "\uD980\uDC00-\uD9BF\uDFFD" + + "\uD9C0\uDC00-\uD9FF\uDFFD" + + "\uDA00\uDC00-\uDA3F\uDFFD" + + "\uDA40\uDC00-\uDA7F\uDFFD" + + "\uDA80\uDC00-\uDABF\uDFFD" + + "\uDAC0\uDC00-\uDAFF\uDFFD" + + "\uDB00\uDC00-\uDB3F\uDFFD" + + "\uDB44\uDC00-\uDB7F\uDFFD" + + "&&[^\u00A0[\u2000-\u200A]\u2028\u2029\u202F\u3000]]" + + /** + * Valid characters for IRI label defined in RFC 3987. 
+ */ + // language=RegExp + private const val LABEL_CHAR = """a-zA-Z0-9$UCS_CHAR""" + + /** + * Valid characters for IRI TLD defined in RFC 3987. + */ + // language=RegExp + private const val TLD_CHAR = """a-zA-Z$UCS_CHAR""" + + /** + * RFC 1035 Section 2.3.4 limits the labels to a maximum 63 octets. + */ + // language=RegExp + private const val IRI_LABEL = """[$LABEL_CHAR](?:[${LABEL_CHAR}_\-]{0,61}[$LABEL_CHAR]){0,1}""" + + /** + * RFC 3492 references RFC 1034 and limits Punycode algorithm output to 63 characters. + */ + // language=RegExp + private const val PUNYCODE_TLD = """xn\-\-[\w\-]{0,58}\w""" + + // language=RegExp + private const val TLD = """($PUNYCODE_TLD|[$TLD_CHAR]{2,63})""" + + // language=RegExp + private const val HOST_NAME = """($IRI_LABEL\.)+$TLD""" + + // language=RegExp + private const val DOMAIN_NAME = """($HOST_NAME|$IP_ADDRESS)""" + + // language=RegExp + private const val PROTOCOL = "(?i:http|https|rtsp)://" + + /* A word boundary or end of input. This is to stop foo.sure from matching as foo.su */ + // NOTE: We've modified the word boundary matcher to add (?=\s) to match trailing slashes + // language=RegExp + private const val WORD_BOUNDARY = "(?:\\b|$|^|(?=\\s))" + + // language=RegExp + private const val USER_INFO = ("(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" + + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" + + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@") + + // language=RegExp + private const val PORT_NUMBER = "\\:\\d{1,5}" + + // language=RegExp + private const val PATH_AND_QUERY = """[/\?](?:(?:[$LABEL_CHAR;/\?:@&=#~\-\.\+!\*'\(\),_\$])|(?:%[a-fA-F0-9]{2}))*""" + + /** + * Regular expression that matches known TLDs and punycode TLDs + */ + // language=RegExp + private const val STRICT_TLD = """(?:$IANA_TOP_LEVEL_DOMAINS|$PUNYCODE_TLD)""" + + /** + * Regular expression that matches host names using [.STRICT_TLD] + */ + // language=RegExp + private const val 
STRICT_HOST_NAME = """(?:(?:$IRI_LABEL\.)+$STRICT_TLD)""" + + /** + * Regular expression that matches domain names using either [.STRICT_HOST_NAME] or + * [.IP_ADDRESS] + */ + // language=RegExp + private const val STRICT_DOMAIN_NAME = """(?:$STRICT_HOST_NAME|$IP_ADDRESS)""" + + /** + * Regular expression that matches domain names without a TLD + */ + // language=RegExp + private const val RELAXED_DOMAIN_NAME = """(?:(?:$IRI_LABEL(?:\.(?=\S))?)+|$IP_ADDRESS)""" + + /** + * Regular expression to match strings that do not start with a supported protocol. The TLDs + * are expected to be one of the known TLDs. + */ + // language=RegExp + private const val WEB_URL_WITHOUT_PROTOCOL = ("(" + + WORD_BOUNDARY + + "(?(value) + assertEquals(roomId, value.roomId!!.full) + assertEquals(eventId, value.eventId.full) } else { - result.size shouldBe 0 + assertEquals( + expected = emptyList(), + actual = result.values.toList(), + ) } } } @@ -537,16 +231,6 @@ class MatrixRegexTest : TrixnityBaseTest() { UriTest.user("matrix:u/user:example.com?action=chat", "user", "example.com", expected = true) } - @Test - fun shouldPassUserURIWithinAnchorTagWithActionQuery() { - UriTest.user( - "Alice", - "alice", - "example.org", - expected = true - ) - } - @Test fun shouldPassUserURIWithViaQuery() { UriTest.user("matrix:u/user:example.com?via=example.com", "user", "example.com", expected = true) @@ -567,16 +251,6 @@ class MatrixRegexTest : TrixnityBaseTest() { UriTest.user("matrix:u/user:example.com", "user", "example.com", expected = true) } - @Test - fun shouldPassUriURIWithinAnchorTag() { - UriTest.user( - "Dr. 
Karl Tanaka (Demo Bot)", - "demobot8", - "demo.example.de", - expected = true - ) - } - // URIs: Room Alias @Test fun shouldPassRoomAliasURIWithActionQuery() { @@ -625,13 +299,13 @@ class MatrixRegexTest : TrixnityBaseTest() { } @Test - fun shouldFailRoomIdURIWithIllegalQuery() { - UriTest.roomId("matrix:roomid/room:example.com?actioné=messager", "!room:example.com", expected = false) + fun shouldPassRoomIdURIWithIllegalQuery() { + UriTest.roomId("matrix:roomid/room:example.com?actioné=messager", "!room:example.com", expected = true) } @Test - fun shouldFailRoomIdURIWithReservedQuery() { - UriTest.roomId("matrix:roomid/room:example.com?m.action=join", "!room:example.com", expected = false) + fun shouldPassRoomIdURIWithReservedQuery() { + UriTest.roomId("matrix:roomid/room:example.com?m.action=join", "!room:example.com", expected = true) } @Test @@ -711,15 +385,16 @@ class MatrixRegexTest : TrixnityBaseTest() { // Permalinks (matrix.to) object PermalinkTest { fun user(permalink: String, localpart: String, domain: String, expected: Boolean) { - val result = findMentions("Hello $permalink :D") + val text = "Hello $permalink :D" + val result = findMentions(text) - result.values.any { - it.match == permalink + result.keys.any { + text.substring(it) == permalink } shouldBe expected if (expected) { result.size shouldBe 1 - (result.entries.first { it.value.match == permalink }.value as Mention.User).userId shouldBe UserId( + (result.entries.first { text.substring(it.key) == permalink }.value as Mention.User).userId shouldBe UserId( localpart, domain ) @@ -729,15 +404,16 @@ class MatrixRegexTest : TrixnityBaseTest() { } fun roomId(permalink: String, roomId: String, expected: Boolean) { - val result = findMentions("omw to $permalink now") + val text = "omw to $permalink now" + val result = findMentions(text) - result.values.any { - it.match == permalink + result.keys.any { + text.substring(it) == permalink } shouldBe expected if (expected) { result.size shouldBe 1 - 
(result.entries.first { it.value.match == permalink }.value as Mention.Room).roomId shouldBe + (result.entries.first { text.substring(it.key) == permalink }.value as Mention.Room).roomId shouldBe RoomId(roomId) } else { result.size shouldBe 0 @@ -745,15 +421,16 @@ class MatrixRegexTest : TrixnityBaseTest() { } fun roomAlias(permalink: String, localpart: String, domain: String, expected: Boolean) { - val result = findMentions("omw to $permalink now") + val text = "omw to $permalink now" + val result = findMentions(text) - result.values.any { - it.match == permalink + result.keys.any { + text.substring(it) == permalink } shouldBe expected if (expected) { result.size shouldBe 1 - (result.entries.first { it.value.match == permalink }.value as Mention.RoomAlias).roomAliasId shouldBe RoomAliasId( + (result.entries.first { text.substring(it.key) == permalink }.value as Mention.RoomAlias).roomAliasId shouldBe RoomAliasId( localpart, domain ) @@ -763,16 +440,17 @@ class MatrixRegexTest : TrixnityBaseTest() { } fun event(permalink: String, roomId: String, eventId: String, expected: Boolean) { - val result = findMentions("You can find it at $permalink :)") + val text = "You can find it at $permalink :)" + val result = findMentions(text) - result.values.any { - it.match == permalink + result.keys.any { + text.substring(it) == permalink } shouldBe expected if (expected) { result.size shouldBe 1 - val mention = result.entries.first { it.value.match == permalink }.value + val mention = result.entries.first { text.substring(it.key) == permalink }.value if (mention !is Mention.Event) { fail("Wrong Mention type") } else { @@ -786,25 +464,6 @@ class MatrixRegexTest : TrixnityBaseTest() { } // Permalink: User ID - @Test - fun shouldPassUserPermalinkWithinAnchorTag() { - PermalinkTest.user( - "Hallo", - "user", - "example.com", - expected = true - ) - } - - @Test - fun shouldPassEncodedUserPermalinkWithinAnchorTag() { - PermalinkTest.user( - "Hallo", - "user", - "example.com", - 
expected = true - ) - } @Test fun shouldPassUserPermalink() { @@ -816,44 +475,7 @@ class MatrixRegexTest : TrixnityBaseTest() { PermalinkTest.user("https://matrix.to/#/%40alice%3Aexample.org", "alice", "example.org", expected = true) } - @Test - fun shouldPassUsersPermalinksWithinAnchorTag() { - val karl = "Dr. Karl Tanaka (Demo Bot)" - val wolfgang = - "Dr. Wolfgang Reidorf (Demo Bot)" - - val message = "$karl und $wolfgang wie geht's euch?" - - val result = findMentions(message) - result.size shouldBe 2 - - result.values.any { - it.match == karl - } shouldBe true - (result.entries.first { it.value.match == karl }.value as Mention.User).userId shouldBe UserId( - "demobot8", - "demo.example.de" - ) - - result.values.any { - it.match == wolfgang - } shouldBe true - (result.entries.first { it.value.match == wolfgang }.value as Mention.User).userId shouldBe UserId( - "demobot2", - "demo.example.de" - ) - } - // Permalink: Room Alias - @Test - fun shouldPassRoomAliasPermalinkWithinAnchorTag() { - PermalinkTest.roomAlias( - "Hallo", - "room", - "example.com", - expected = true - ) - } @Test fun shouldPassRoomAliasPermalink() { @@ -871,14 +493,6 @@ class MatrixRegexTest : TrixnityBaseTest() { } // Permalink: Room ID - @Test - fun shouldPassRoomIdPermalinkWithinAnchorTag() { - PermalinkTest.roomId( - "Hallo", - "!room:example.com", - expected = true - ) - } @Test fun shouldPassRoomIdPermalink() { @@ -895,16 +509,6 @@ class MatrixRegexTest : TrixnityBaseTest() { } // Permalink: Event ID - @Test - fun shouldPassEventIDPermalinkWithinAnchorTag() { - PermalinkTest.event( - "Hallo", - "!room:example.com", - "\$event", - expected = true - ) - } - @Test fun shouldPassEventIDPermalink() { PermalinkTest.event( @@ -920,7 +524,7 @@ class MatrixRegexTest : TrixnityBaseTest() { PermalinkTest.event( "https://matrix.to/#/!room%3Aexample.com/%24event%3Aexample.org?via=elsewhere.ca", "!room:example.com", - "\$event%3Aexample.org", + "\$event:example.org", expected = true ) } @@ -934,19 
+538,11 @@ class MatrixRegexTest : TrixnityBaseTest() { } } - fun makeParameters(params: Map): Parameters { - return Parameters.build { - params.forEach { (key, value) -> - this.append(key, value) - } - } - } - @Test fun shouldPassValidViaParameter() { parameterTest( "matrix:roomid/somewhere%3Aexample.org/%24event%3Aexample.org?via=elsewhere.ca", - makeParameters(mapOf("via" to "elsewhere.ca")), + parametersOf("via" to listOf("elsewhere.ca")), expected = true ) } @@ -955,7 +551,7 @@ class MatrixRegexTest : TrixnityBaseTest() { fun shouldPassActionParameter() { parameterTest( "matrix:roomid/room:example.com/e/event?via=example.com&action=join", - makeParameters(mapOf("action" to "join", "via" to "example.com")), + parametersOf("action" to listOf("join"), "via" to listOf("example.com")), expected = true ) } @@ -964,7 +560,7 @@ class MatrixRegexTest : TrixnityBaseTest() { fun shouldPassActionAndViaParameter() { parameterTest( "matrix:roomid/somewhere%3Aexample.org?action=chat&via=example.com", - makeParameters(mapOf("action" to "chat", "via" to "example.com")), + parametersOf("action" to listOf("chat"), "via" to listOf("example.com")), expected = true ) } @@ -973,90 +569,81 @@ class MatrixRegexTest : TrixnityBaseTest() { fun shouldPassCustomParameter() { parameterTest( "matrix:r/somewhere:example.org?foo=bar", - makeParameters(mapOf("foo" to "bar")), + parametersOf("foo" to listOf("bar")), expected = true ) } @Test - fun shouldFailCustomParameterWithIllegalCharacter() { + fun shouldParseCustomParameterWithIllegalCharacter() { parameterTest( "matrix:u/mario:esempio.it?actionaté=mammamia", - makeParameters(mapOf("actionaté" to "mammamia")), - expected = false + parametersOf("actionaté" to listOf("mammamia")), + expected = true ) } @Test - fun shouldFailCustomParameterWithIllegalStart() { + fun shouldParseCustomParameterWithIllegalStart() { parameterTest( "matrix:u/user:homeserver.рф?m.vector=matrix", - makeParameters(mapOf("m.vector" to "matrix")), - expected = false + 
parametersOf("m.vector" to listOf("matrix")), + expected = true ) } @Test - fun shouldPassCustomParametersWithLastOneBeingIllegal() { + fun shouldParseCustomParametersWithLastOneBeingIllegal() { parameterTest( "matrix:u/user:example.com?foo=bar&actionaté=mammamia", - makeParameters(mapOf("foo" to "bar")), + parametersOf("foo" to listOf("bar"), "actionaté" to listOf("mammamia")), expected = true ) } - // Negative Edgecase - private fun negativeTest(id: String, matcher: Regex? = null) { - val message = "Hello $id :D" - - val result = matcher?.findAll(message)?.toList()?.size - ?: findMentions(message).size - - result shouldBe 0 - } - - // Negative Edgecase: User ID - @Test - fun notMatchIncompleteUserHtmlTag() { - negativeTest("""User", MatrixRegex.userIdPermalinkAnchor) - } - - // Negative Edgecase: Anchors - @Test - fun notMatchIncompleteRoomAliasHtmlTag() { - negativeTest("""Room", MatrixRegex.roomAliasPermalinkAnchor) - } - - @Test - fun notMatchIncompleteRoomIdHtmlTag() { - negativeTest("""User", MatrixRegex.roomIdPermalinkAnchor) - } - - @Test - fun notMatchIncompleteEventIdHtmlTag() { - negativeTest("Event", - MatrixRegex.eventIdPermalinkAnchor + fun `ignores overlaps`() { + val content = "lorem @user:example.org ipsum https://matrix.to/#/@user:example.org?action=chat dolor matrix:u/user:example.org sit" + // Links + assertEquals( + expected = "https://matrix.to/#/@user:example.org?action=chat", + actual = content.substring(30..78), + ) + assertEquals( + expected = "matrix:u/user:example.org", + actual = content.substring(86..110), + ) + assertEquals( + expected = mapOf( + 30..78 to Mention.User(UserId("@user:example.org"), parametersOf("action", "chat")), + 86..110 to Mention.User(UserId("@user:example.org")), + ), + actual = MatrixRegex.findLinkMentions(content) + ) + // Ids + assertEquals( + expected = "@user:example.org", + actual = content.substring(6..22), + ) + assertEquals( + expected = "@user:example.org", + actual = content.substring(50..66), + ) + 
assertEquals( + expected = mapOf( + 6..22 to Mention.User(UserId("@user:example.org")), + 50..66 to Mention.User(UserId("@user:example.org")), + ), + actual = MatrixRegex.findIdMentions(content) + ) + // Combined + assertEquals( + expected = mapOf( + 30..78 to Mention.User(UserId("@user:example.org"), parametersOf("action", "chat")), + 86..110 to Mention.User(UserId("@user:example.org")), + 6..22 to Mention.User(UserId("@user:example.org")), + ), + actual = MatrixRegex.findMentions(content) ) } } -- GitLab From 0740b314e2a88fcba9f269ceca918c4edf41cd5b Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 10:26:51 +0200 Subject: [PATCH 02/11] Improve error logging --- .../folivo/trixnity/core/util/MatrixLinks.kt | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt index d5aaa7a66..ee8254eb9 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt @@ -1,5 +1,6 @@ package net.folivo.trixnity.core.util +import io.github.oshai.kotlinlogging.KotlinLogging import io.ktor.http.* import net.folivo.trixnity.core.model.EventId import net.folivo.trixnity.core.model.Mention @@ -7,6 +8,8 @@ import net.folivo.trixnity.core.model.RoomAliasId import net.folivo.trixnity.core.model.RoomId import net.folivo.trixnity.core.model.UserId +private val log = KotlinLogging.logger {} + object MatrixLinks { private val matrixProtocol = URLProtocol("matrix", 0) @@ -28,12 +31,18 @@ object MatrixLinks { private fun parseMatrixTo(path: List, parameters: Parameters): Mention? 
{ val parts = path.map { id -> when { - id.length > 255 -> null + id.length > 255 -> { + log.trace { "malformed matrix link: id too long: ${id.length} (max length: 255)" } + null + } id.startsWith(RoomAliasId.sigilCharacter) -> RoomAliasId(id) id.startsWith(RoomId.sigilCharacter) -> RoomId(id) id.startsWith(UserId.sigilCharacter) -> UserId(id) id.startsWith(EventId.sigilCharacter) -> EventId(id) - else -> null + else -> { + log.trace { "malformed matrix link: invalid id type: ${id.firstOrNull()} (known types: #, !, @, $)" } + null + } } } val first = parts.getOrNull(0) @@ -44,19 +53,28 @@ object MatrixLinks { first is EventId -> Mention.Event(null, first, parameters) first is RoomId && second is EventId -> Mention.Event(first, second, parameters) first is RoomId -> Mention.Room(first, parameters) - else -> null + else -> { + log.trace { "malformed matrix link: unknown format" } + null + } } } private fun parseMatrixProtocol(path: List, parameters: Parameters): Mention? { val parts = path.windowed(2, 2).map { (type, id) -> when { - id.length > 255 -> null + id.length > 255 -> { + log.trace { "malformed matrix link: id too long: ${id.length} (max length: 255)" } + null + } type == "roomid" -> RoomId("!$id") type == "r" -> RoomAliasId("#$id") type == "u" -> UserId("@$id") type == "e" -> EventId("$$id") - else -> null + else -> { + log.trace { "malformed matrix link: invalid id type: $type (known types: roomid, r, u, e)" } + null + } } } val first = parts.getOrNull(0) @@ -66,7 +84,10 @@ object MatrixLinks { first is RoomAliasId -> Mention.RoomAlias(first, parameters) first is RoomId && second is EventId -> Mention.Event(first, second, parameters) first is RoomId -> Mention.Room(first, parameters) - else -> null + else -> { + log.trace { "malformed matrix link: unknown format" } + null + } } } } \ No newline at end of file -- GitLab From 51d73328f5c4a0a910f9541d87cc6cdccdee5de4 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 10:27:14 +0200 
Subject: [PATCH 03/11] Add comments for non-obvious behaviour --- .../net/folivo/trixnity/core/MatrixRegex.kt | 22 ++++++++++++------- .../folivo/trixnity/core/util/MatrixLinks.kt | 6 +++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 5a0baddc5..75b0b9a9f 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -16,14 +16,9 @@ object MatrixRegex { val users = findIdMentions(message) val linksRange = links.keys.sortedBy { it.first } val uniqueUsers = users.filter { (user, _) -> - val index = linksRange.binarySearch { link -> - when { - link.first > user.first -> -1 - link.last < user.last -> 1 - else -> 0 - } - } - index < 0 + // We don't want id matches that overlap with link matches, + // as matrix.to urls will match both as link and as id + !linksRange.overlaps(user) } return links.plus(uniqueUsers).toMap() } @@ -48,4 +43,15 @@ object MatrixRegex { else -> null } } + + private fun List<IntRange>.overlaps(match: IntRange): Boolean { + val index = binarySearch { other -> + when { + other.first > match.first -> 1 // link starts after the id: keep searching to the left + other.last < match.last -> -1 // link ends before the id: keep searching to the right + else -> 0 // link fully contains the id + } + } + return index >= 0 + } } \ No newline at end of file diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt index ee8254eb9..2e553a8f6 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt @@ -18,7 +18,13 @@ object MatrixLinks { if (url.protocol == matrixProtocol) { return parseMatrixProtocol(url.segments, url.parameters) } + // matrix.to URLs look like this: + //
https://matrix.to/#/!roomId?via=example.org + // protocol=https host=matrix.to segments=[] fragment=/!roomId?via=example.org if (url.protocol == URLProtocol.HTTPS && url.host == "matrix.to" && url.segments.isEmpty()) { + // matrix.to uses AJAX hash routing, where the entire path is passed within the hash fragment to prevent + // the server from seeing the roomId. + // This means we have to parse this hash back into path segments and query parameters val path = url.fragment.substringBefore('?').removePrefix("/") val query = url.fragment.substringAfter('?', missingDelimiterValue = "") val segments = path.removePrefix("/").split('/') -- GitLab From beca2261872ea524ad24efb106dbac4f939041da Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 10:27:28 +0200 Subject: [PATCH 04/11] Fix illegal characters in test names --- .../kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt index 69781eb82..e02eae9ec 100644 --- a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt +++ b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt @@ -136,7 +136,7 @@ class MatrixLinkTest { } @Test - fun `parses matrixto roomid (v12) links`() { + fun `parses matrixto roomid v12 links`() { assertEquals( expected = Mention.Room(RoomId("!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), actual = MatrixLinks.parse("https://matrix.to/#/!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") @@ -162,7 +162,7 @@ class MatrixLinkTest { } @Test - fun `parses matrix protocol roomid (v12) links`() { + fun `parses matrix protocol roomid v12 links`() { assertEquals( expected = Mention.Room(RoomId("!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), actual = 
MatrixLinks.parse("matrix:roomid/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") @@ -240,7 +240,7 @@ class MatrixLinkTest { } @Test - fun `parses matrixto event (v12) links`() { + fun `parses matrixto event v12 links`() { assertEquals( expected = Mention.Event(RoomId("!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE"), EventId("\$NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), actual = MatrixLinks.parse("https://matrix.to/#/!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/\$NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") @@ -288,7 +288,7 @@ class MatrixLinkTest { } @Test - fun `parses matrix protocol event (v12) links`() { + fun `parses matrix protocol event v12 links`() { assertEquals( expected = Mention.Event(RoomId("!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE"), EventId("\$NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), actual = MatrixLinks.parse("matrix:roomid/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/e/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") -- GitLab From 47e5eacba144cd5cf65540a3a080c5297c1448da Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 10:59:11 +0200 Subject: [PATCH 05/11] Early return for invalid ids --- .../kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt index 2e553a8f6..fc9113ea7 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/MatrixLinks.kt @@ -39,7 +39,7 @@ object MatrixLinks { when { id.length > 255 -> { log.trace { "malformed matrix link: id too long: ${id.length} (max length: 255)" } - null + return null } id.startsWith(RoomAliasId.sigilCharacter) -> RoomAliasId(id) id.startsWith(RoomId.sigilCharacter) -> RoomId(id) @@ -71,7 +71,7 @@ object MatrixLinks { when { 
id.length > 255 -> { log.trace { "malformed matrix link: id too long: ${id.length} (max length: 255)" } - null + return null } type == "roomid" -> RoomId("!$id") type == "r" -> RoomAliasId("#$id") -- GitLab From 600b2a97d9c9599fd4a0e024576fc7e5d69d3948 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 10:59:37 +0200 Subject: [PATCH 06/11] Add additional tests for long ids --- .../folivo/trixnity/core/MatrixLinkTest.kt | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt index e02eae9ec..f390f6d20 100644 --- a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt +++ b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixLinkTest.kt @@ -301,4 +301,156 @@ class MatrixLinkTest { actual = MatrixLinks.parse("matrix:roomid/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/e/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE?via=example.org&action=join&via=elsewhere.ca") ) } + + @Test + fun `allows long matrixto links`() { + val longId = ( + "aaaaaaaaa1aaaaaaaaa2aaaaaaaaa3aaaaaaaaa4aaaaaaaaa5aaaaaaaaa6aaaaaaaaa7aaaaaaaaa8aaaaaaaaa9aaaaaaaa10" + + "aaaaaaaa11aaaaaaaa12aaaaaaaa13aaaaaaaa14aaaaaaaa15aaaaaaaa16aaaaaaaa17aaaaaaaa18aaaaaaaa19aaaaaaaa20" + + "aaaaaaaa21aaaaaaaa22aaaaaaaa23aaaaaaaa24aa:example.org" + ) + assertEquals( + expected = 254, + actual = longId.length, + ) + assertEquals( + expected = Mention.User(UserId(UserId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/@$longId") + ) + assertEquals( + expected = Mention.RoomAlias(RoomAliasId(RoomAliasId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/#$longId") + ) + assertEquals( + expected = Mention.Room(RoomId(RoomId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/!$longId") + ) + assertEquals( + 
expected = Mention.Event(RoomId(RoomId.sigilCharacter + longId), EventId(EventId.sigilCharacter + "NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), + actual = MatrixLinks.parse("https://matrix.to/#/!$longId/\$NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") + ) + assertEquals( + expected = Mention.Event(RoomId(RoomId.sigilCharacter + "NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE"), EventId(EventId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/$$longId") + ) + assertEquals( + expected = Mention.Event(RoomId(RoomId.sigilCharacter + longId), EventId(EventId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/!$longId/$$longId") + ) + assertEquals( + expected = Mention.Event(null, EventId(EventId.sigilCharacter + longId)), + actual = MatrixLinks.parse("https://matrix.to/#/$$longId") + ) + } + + @Test + fun `allows long matrix protocol links`() { + val longId = ( + "aaaaaaaaa1aaaaaaaaa2aaaaaaaaa3aaaaaaaaa4aaaaaaaaa5aaaaaaaaa6aaaaaaaaa7aaaaaaaaa8aaaaaaaaa9aaaaaaaa10" + + "aaaaaaaa11aaaaaaaa12aaaaaaaa13aaaaaaaa14aaaaaaaa15aaaaaaaa16aaaaaaaa17aaaaaaaa18aaaaaaaa19aaaaaaaa20" + + "aaaaaaaa21aaaaaaaa22aaaaaaaa23aaaaaaaa24aa:example.org" + ) + assertEquals( + expected = 254, + actual = longId.length, + ) + assertEquals( + expected = Mention.User(UserId(UserId.sigilCharacter + longId)), + actual = MatrixLinks.parse("matrix:u/$longId") + ) + assertEquals( + expected = Mention.RoomAlias(RoomAliasId(RoomAliasId.sigilCharacter + longId)), + actual = MatrixLinks.parse("matrix:r/$longId") + ) + assertEquals( + expected = Mention.Room(RoomId(RoomId.sigilCharacter + longId)), + actual = MatrixLinks.parse("matrix:roomid/$longId") + ) + assertEquals( + expected = Mention.Event(RoomId(RoomId.sigilCharacter + longId), EventId(EventId.sigilCharacter + "NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE")), + actual = 
MatrixLinks.parse("matrix:roomid/$longId/e/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") + ) + assertEquals( + expected = Mention.Event(RoomId(RoomId.sigilCharacter + "NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE"), EventId(EventId.sigilCharacter + longId)), + actual = MatrixLinks.parse("matrix:roomid/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/e/$longId") + ) + assertEquals( + expected = Mention.Event(RoomId(RoomId.sigilCharacter + longId), EventId(EventId.sigilCharacter + longId)), + actual = MatrixLinks.parse("matrix:roomid/$longId/e/$longId") + ) + } + + @Test + fun `rejects too long matrixto links`() { + val tooLongId = ( + "aaaaaaaaa1aaaaaaaaa2aaaaaaaaa3aaaaaaaaa4aaaaaaaaa5aaaaaaaaa6aaaaaaaaa7aaaaaaaaa8aaaaaaaaa9aaaaaaaa10" + + "aaaaaaaa11aaaaaaaa12aaaaaaaa13aaaaaaaa14aaaaaaaa15aaaaaaaa16aaaaaaaa17aaaaaaaa18aaaaaaaa19aaaaaaaa20" + + "aaaaaaaa21aaaaaaaa22aaaaaaaa23aaaaaaaa24aaaaaaaa25aaaaaaaa26:example.org" + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/@$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/#$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/!$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/!$tooLongId/\$NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/!NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/e/$$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/!$tooLongId/$$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("https://matrix.to/#/$$tooLongId") + ) + } + + @Test + fun `rejects too long matrix protocol links`() { + val tooLongId = ( + "aaaaaaaaa1aaaaaaaaa2aaaaaaaaa3aaaaaaaaa4aaaaaaaaa5aaaaaaaaa6aaaaaaaaa7aaaaaaaaa8aaaaaaaaa9aaaaaaaa10" + + 
"aaaaaaaa11aaaaaaaa12aaaaaaaa13aaaaaaaa14aaaaaaaa15aaaaaaaa16aaaaaaaa17aaaaaaaa18aaaaaaaa19aaaaaaaa20" + + "aaaaaaaa21aaaaaaaa22aaaaaaaa23aaaaaaaa24aaaaaaaa25aaaaaaaa26:example.org" + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:u/$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:r/$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:roomid/$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:roomid/$tooLongId/e/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:roomid/NXTQJLZfL7TpVrS6TcznngpZiiuwZcJXdr1ODlnT-sE/e/$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:roomid/$tooLongId/e/$tooLongId") + ) + assertEquals( + expected = null, + actual = MatrixLinks.parse("matrix:e/$tooLongId") + ) + } } -- GitLab From 5f448ab84f5d51651feadf7faccccf6c4e87cfe2 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 11:11:50 +0200 Subject: [PATCH 07/11] Add additional validation for long ids --- .../kotlin/net/folivo/trixnity/core/MatrixRegex.kt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 75b0b9a9f..66673e6b2 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -1,11 +1,14 @@ package net.folivo.trixnity.core +import io.github.oshai.kotlinlogging.KotlinLogging import net.folivo.trixnity.core.model.Mention import net.folivo.trixnity.core.model.RoomAliasId import net.folivo.trixnity.core.model.UserId import net.folivo.trixnity.core.util.MatrixLinks import net.folivo.trixnity.core.util.Patterns +private val log = 
KotlinLogging.logger {} + object MatrixRegex { // language=Regexp private const val ID_PATTERN = """[@#][0-9a-z\-.=_/+]+:(?:[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|\[[0-9a-fA-F:.]{2,45}]|[0-9a-zA-Z\-.]{1,255})(?::[0-9]{1,5})?""" @@ -38,6 +41,10 @@ object MatrixRegex { private fun parseMatrixId(id: String): Mention? { return when { + id.length > 255 -> { + log.trace { "malformed matrix id: id too long: ${id.length} (max length: 255)" } + null + } id.startsWith(UserId.sigilCharacter) -> Mention.User(UserId(id)) id.startsWith(RoomAliasId.sigilCharacter) -> Mention.RoomAlias(RoomAliasId(id)) else -> null -- GitLab From 2c1d6aeb4e547b3dd5c8a849dd92a871020051c8 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 16:13:28 +0200 Subject: [PATCH 08/11] Expose validation functions --- .../kotlin/net/folivo/trixnity/core/MatrixRegex.kt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 66673e6b2..89e5a49f4 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -39,6 +39,16 @@ object MatrixRegex { }.toMap() } + fun isValidUserId(id: String): Boolean = + id.length <= 255 + && id.startsWith(UserId.sigilCharacter) + && id.matches(idRegex) + + fun isValidRoomAliasId(id: String): Boolean = + id.length <= 255 + && id.startsWith(RoomAliasId.sigilCharacter) + && id.matches(idRegex) + private fun parseMatrixId(id: String): Mention? 
{ return when { id.length > 255 -> { -- GitLab From de819aed04d16a50a6c0757172b50d464b93dc5e Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Mon, 21 Jul 2025 17:15:59 +0200 Subject: [PATCH 09/11] Fix failing regex in web --- .../net/folivo/trixnity/core/MatrixRegex.kt | 2 +- .../net/folivo/trixnity/core/util/Patterns.kt | 83 ++++++++++--------- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 89e5a49f4..51891d3b7 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -11,7 +11,7 @@ private val log = KotlinLogging.logger {} object MatrixRegex { // language=Regexp - private const val ID_PATTERN = """[@#][0-9a-z\-.=_/+]+:(?:[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|\[[0-9a-fA-F:.]{2,45}]|[0-9a-zA-Z\-.]{1,255})(?::[0-9]{1,5})?""" + private const val ID_PATTERN = """[@#][0-9a-z\-.=_/+]+:(?:[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|\[[0-9a-fA-F:.]{2,45}\]|[0-9a-zA-Z\-.]{1,255})(?::[0-9]{1,5})?""" private val idRegex = ID_PATTERN.toRegex() fun findMentions(message: String): Map { diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt index 14c40cdb2..83b15e2b8 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt @@ -158,35 +158,35 @@ internal object Patterns { + "|\u7f51\u7edc|\u8054\u901a|\u8c37\u6b4c|\u8d2d\u7269|\u901a\u8ca9|\u96c6\u56e2|\u96fb\u8a0a\u76c8\u79d1" + "|\u98de\u5229\u6d66|\u98df\u54c1|\u9910\u5385|\u9999\u683c\u91cc\u62c9|\u9999\u6e2f" + "|\ub2f7\ub137|\ub2f7\ucef4|\uc0bc\uc131|\ud55c\uad6d" - + 
"|xbox|xerox|xfinity|xihuan|xin|xn\\-\\-11b4c3d|xn\\-\\-1ck2e1b|xn\\-\\-1qqw23a|xn\\-\\-2scrj9c" - + "|xn\\-\\-30rr7y|xn\\-\\-3bst00m|xn\\-\\-3ds443g|xn\\-\\-3e0b707e|xn\\-\\-3hcrj9c|xn\\-\\-3pxu8k" - + "|xn\\-\\-42c2d9a|xn\\-\\-45br5cyl|xn\\-\\-45brj9c|xn\\-\\-45q11c|xn\\-\\-4dbrk0ce|xn\\-\\-4gbrim" - + "|xn\\-\\-54b7fta0cc|xn\\-\\-55qw42g|xn\\-\\-55qx5d|xn\\-\\-5su34j936bgsg|xn\\-\\-5tzm5g" - + "|xn\\-\\-6frz82g|xn\\-\\-6qq986b3xl|xn\\-\\-80adxhks|xn\\-\\-80ao21a|xn\\-\\-80aqecdr1a" - + "|xn\\-\\-80asehdb|xn\\-\\-80aswg|xn\\-\\-8y0a063a|xn\\-\\-90a3ac|xn\\-\\-90ae|xn\\-\\-90ais" - + "|xn\\-\\-9dbq2a|xn\\-\\-9et52u|xn\\-\\-9krt00a|xn\\-\\-b4w605ferd|xn\\-\\-bck1b9a5dre4c" - + "|xn\\-\\-c1avg|xn\\-\\-c2br7g|xn\\-\\-cck2b3b|xn\\-\\-cckwcxetd|xn\\-\\-cg4bki|xn\\-\\-clchc0ea0b2g2a9gcd" - + "|xn\\-\\-czr694b|xn\\-\\-czrs0t|xn\\-\\-czru2d|xn\\-\\-d1acj3b|xn\\-\\-d1alf|xn\\-\\-e1a4c" - + "|xn\\-\\-eckvdtc9d|xn\\-\\-efvy88h|xn\\-\\-fct429k|xn\\-\\-fhbei|xn\\-\\-fiq228c5hs" - + "|xn\\-\\-fiq64b|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s|xn\\-\\-fjq720a|xn\\-\\-flw351e|xn\\-\\-fpcrj9c3d" - + "|xn\\-\\-fzc2c9e2c|xn\\-\\-fzys8d69uvgm|xn\\-\\-g2xx48c|xn\\-\\-gckr3f0f|xn\\-\\-gecrj9c" - + "|xn\\-\\-gk3at1e|xn\\-\\-h2breg3eve|xn\\-\\-h2brj9c|xn\\-\\-h2brj9c8c|xn\\-\\-hxt814e" - + "|xn\\-\\-i1b6b1a6a2e|xn\\-\\-imr513n|xn\\-\\-io0a7i|xn\\-\\-j1aef|xn\\-\\-j1amh|xn\\-\\-j6w193g" - + "|xn\\-\\-jlq480n2rg|xn\\-\\-jvr189m|xn\\-\\-kcrx77d1x4a|xn\\-\\-kprw13d|xn\\-\\-kpry57d" - + "|xn\\-\\-kput3i|xn\\-\\-l1acc|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgb9awbf|xn\\-\\-mgba3a3ejt" - + "|xn\\-\\-mgba3a4f16a|xn\\-\\-mgba7c0bbn0a|xn\\-\\-mgbaakc7dvf|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbab2bd" - + "|xn\\-\\-mgbah1a3hjkrd|xn\\-\\-mgbai9azgqp6j|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a|xn\\-\\-mgbbh1a71e" - + "|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgbca7dzdo|xn\\-\\-mgbcpq6gpa1a|xn\\-\\-mgberp4a5d4ar|xn\\-\\-mgbgu82a" - + "|xn\\-\\-mgbi4ecexp|xn\\-\\-mgbpl2fh|xn\\-\\-mgbt3dhd|xn\\-\\-mgbtx2b|xn\\-\\-mgbx4cd0ab" - + 
"|xn\\-\\-mix891f|xn\\-\\-mk1bu44c|xn\\-\\-mxtq1m|xn\\-\\-ngbc5azd|xn\\-\\-ngbe9e0a|xn\\-\\-ngbrx" - + "|xn\\-\\-node|xn\\-\\-nqv7f|xn\\-\\-nqv7fs00ema|xn\\-\\-nyqy26a|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl" - + "|xn\\-\\-otu796d|xn\\-\\-p1acf|xn\\-\\-p1ai|xn\\-\\-pgbs0dh|xn\\-\\-pssy2u|xn\\-\\-q7ce6a" - + "|xn\\-\\-q9jyb4c|xn\\-\\-qcka1pmc|xn\\-\\-qxa6a|xn\\-\\-qxam|xn\\-\\-rhqv96g|xn\\-\\-rovu88b" - + "|xn\\-\\-rvc1e0am3e|xn\\-\\-s9brj9c|xn\\-\\-ses554g|xn\\-\\-t60b56a|xn\\-\\-tckwe|xn\\-\\-tiq49xqyj" - + "|xn\\-\\-unup4y|xn\\-\\-vermgensberater\\-ctb|xn\\-\\-vermgensberatung\\-pwb|xn\\-\\-vhquv" - + "|xn\\-\\-vuq861b|xn\\-\\-w4r85el8fhu5dnra|xn\\-\\-w4rs40l|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a" - + "|xn\\-\\-xhq521b|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-y9a3aq|xn\\-\\-yfro4i67o" - + "|xn\\-\\-ygbi2ammx|xn\\-\\-zfr164b|xxx|xyz)" + + "|xbox|xerox|xfinity|xihuan|xin|xn--11b4c3d|xn--1ck2e1b|xn--1qqw23a|xn--2scrj9c" + + "|xn--30rr7y|xn--3bst00m|xn--3ds443g|xn--3e0b707e|xn--3hcrj9c|xn--3pxu8k" + + "|xn--42c2d9a|xn--45br5cyl|xn--45brj9c|xn--45q11c|xn--4dbrk0ce|xn--4gbrim" + + "|xn--54b7fta0cc|xn--55qw42g|xn--55qx5d|xn--5su34j936bgsg|xn--5tzm5g" + + "|xn--6frz82g|xn--6qq986b3xl|xn--80adxhks|xn--80ao21a|xn--80aqecdr1a" + + "|xn--80asehdb|xn--80aswg|xn--8y0a063a|xn--90a3ac|xn--90ae|xn--90ais" + + "|xn--9dbq2a|xn--9et52u|xn--9krt00a|xn--b4w605ferd|xn--bck1b9a5dre4c" + + "|xn--c1avg|xn--c2br7g|xn--cck2b3b|xn--cckwcxetd|xn--cg4bki|xn--clchc0ea0b2g2a9gcd" + + "|xn--czr694b|xn--czrs0t|xn--czru2d|xn--d1acj3b|xn--d1alf|xn--e1a4c" + + "|xn--eckvdtc9d|xn--efvy88h|xn--fct429k|xn--fhbei|xn--fiq228c5hs" + + "|xn--fiq64b|xn--fiqs8s|xn--fiqz9s|xn--fjq720a|xn--flw351e|xn--fpcrj9c3d" + + "|xn--fzc2c9e2c|xn--fzys8d69uvgm|xn--g2xx48c|xn--gckr3f0f|xn--gecrj9c" + + "|xn--gk3at1e|xn--h2breg3eve|xn--h2brj9c|xn--h2brj9c8c|xn--hxt814e" + + "|xn--i1b6b1a6a2e|xn--imr513n|xn--io0a7i|xn--j1aef|xn--j1amh|xn--j6w193g" + + "|xn--jlq480n2rg|xn--jvr189m|xn--kcrx77d1x4a|xn--kprw13d|xn--kpry57d" + + 
"|xn--kput3i|xn--l1acc|xn--lgbbat1ad8j|xn--mgb9awbf|xn--mgba3a3ejt" + + "|xn--mgba3a4f16a|xn--mgba7c0bbn0a|xn--mgbaakc7dvf|xn--mgbaam7a8h|xn--mgbab2bd" + + "|xn--mgbah1a3hjkrd|xn--mgbai9azgqp6j|xn--mgbayh7gpa|xn--mgbbh1a|xn--mgbbh1a71e" + + "|xn--mgbc0a9azcg|xn--mgbca7dzdo|xn--mgbcpq6gpa1a|xn--mgberp4a5d4ar|xn--mgbgu82a" + + "|xn--mgbi4ecexp|xn--mgbpl2fh|xn--mgbt3dhd|xn--mgbtx2b|xn--mgbx4cd0ab" + + "|xn--mix891f|xn--mk1bu44c|xn--mxtq1m|xn--ngbc5azd|xn--ngbe9e0a|xn--ngbrx" + + "|xn--node|xn--nqv7f|xn--nqv7fs00ema|xn--nyqy26a|xn--o3cw4h|xn--ogbpf8fl" + + "|xn--otu796d|xn--p1acf|xn--p1ai|xn--pgbs0dh|xn--pssy2u|xn--q7ce6a" + + "|xn--q9jyb4c|xn--qcka1pmc|xn--qxa6a|xn--qxam|xn--rhqv96g|xn--rovu88b" + + "|xn--rvc1e0am3e|xn--s9brj9c|xn--ses554g|xn--t60b56a|xn--tckwe|xn--tiq49xqyj" + + "|xn--unup4y|xn--vermgensberater-ctb|xn--vermgensberatung-pwb|xn--vhquv" + + "|xn--vuq861b|xn--w4r85el8fhu5dnra|xn--w4rs40l|xn--wgbh1c|xn--wgbl6a" + + "|xn--xhq521b|xn--xkc2al3hye2a|xn--xkc2dl3a5ee0h|xn--y9a3aq|xn--yfro4i67o" + + "|xn--ygbi2ammx|xn--zfr164b|xxx|xyz)" + "|(?:yachts|yahoo|yamaxun|yandex|yodobashi|yoga|yokohama|you|youtube|yun|y[et])" + "|(?:zappos|zara|zero|zip|zone|zuerich|z[amw]))") @@ -200,8 +200,12 @@ internal object Patterns { * Valid UCS characters defined in RFC 3987. Excludes space characters. */ // language=RegExp - private const val UCS_CHAR = "[" + - "\u00A0-\uD7FF" + + private const val UCS_CHAR = + "\u00A1-\u1FFF" + + "\u200B-\u2027" + + "\u202A-\u202E" + + "\u2030-\u2FFF" + + "\u3001-\uD7FF" + "\uF900-\uFDCF" + "\uFDF0-\uFFEF" + "\uD800\uDC00-\uD83F\uDFFD" + @@ -217,8 +221,7 @@ internal object Patterns { "\uDA80\uDC00-\uDABF\uDFFD" + "\uDAC0\uDC00-\uDAFF\uDFFD" + "\uDB00\uDC00-\uDB3F\uDFFD" + - "\uDB44\uDC00-\uDB7F\uDFFD" + - "&&[^\u00A0[\u2000-\u200A]\u2028\u2029\u202F\u3000]]" + "\uDB44\uDC00-\uDB7F\uDFFD" /** * Valid characters for IRI label defined in RFC 3987. 
@@ -242,7 +245,7 @@ internal object Patterns { * RFC 3492 references RFC 1034 and limits Punycode algorithm output to 63 characters. */ // language=RegExp - private const val PUNYCODE_TLD = """xn\-\-[\w\-]{0,58}\w""" + private const val PUNYCODE_TLD = """xn--[\w\-]{0,58}\w""" // language=RegExp private const val TLD = """($PUNYCODE_TLD|[$TLD_CHAR]{2,63})""" @@ -262,15 +265,15 @@ internal object Patterns { private const val WORD_BOUNDARY = "(?:\\b|$|^|(?=\\s))" // language=RegExp - private const val USER_INFO = ("(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)" - + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_" - + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@") + private const val USER_INFO = ("(?:[a-zA-Z0-9$\\-_.+!*'()" + + ",;?&=]|(?:%[a-fA-F0-9]{2})){1,64}(?::(?:[a-zA-Z0-9$\\-_" + + ".+!*'(),;?&=]|(?:%[a-fA-F0-9]{2})){1,25})?@") // language=RegExp - private const val PORT_NUMBER = "\\:\\d{1,5}" + private const val PORT_NUMBER = ":\\d{1,5}" // language=RegExp - private const val PATH_AND_QUERY = """[/\?](?:(?:[$LABEL_CHAR;/\?:@&=#~\-\.\+!\*'\(\),_\$])|(?:%[a-fA-F0-9]{2}))*""" + private const val PATH_AND_QUERY = """[/?](?:(?:[$LABEL_CHAR;/?:@&=#~\-.+!*'(),_$])|(?:%[a-fA-F0-9]{2}))*""" /** * Regular expression that matches known TLDs and punycode TLDs @@ -330,7 +333,7 @@ internal object Patterns { + ")") // language=RegExp - private const val MATRIX_PATH_AND_QUERY = """(?:(?:[$LABEL_CHAR;/\?:@&=#~$\-\.\+!\*'\(\),_\$])|(?:%[a-fA-F0-9]{2}))*""" + private const val MATRIX_PATH_AND_QUERY = """(?:(?:[$LABEL_CHAR;/?:@&=#~$\-.+!*'(),_$])|(?:%[a-fA-F0-9]{2}))*""" // language=RegExp private const val MATRIX_URI = ("""(${WORD_BOUNDARY}matrix:$MATRIX_PATH_AND_QUERY$WORD_BOUNDARY)""") -- GitLab From 53cdbea3aaa4ca3cd6dc12801bb0309005815f56 Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Tue, 22 Jul 2025 01:16:56 +0200 Subject: [PATCH 10/11] Fix failing regex in nodejs --- 
.../commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt index 83b15e2b8..260269a8f 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/util/Patterns.kt @@ -257,7 +257,7 @@ internal object Patterns { private const val DOMAIN_NAME = """($HOST_NAME|$IP_ADDRESS)""" // language=RegExp - private const val PROTOCOL = "(?i:http|https|rtsp)://" + private const val PROTOCOL = "(?:[hH][tT][tT][pP][sS]?)://" /* A word boundary or end of input. This is to stop foo.sure from matching as foo.su */ // NOTE: We've modified the word boundary matcher to add (?=\s) to match trailing slashes -- GitLab From 668dfd9e603d612414564f8f886bf5ac898384ca Mon Sep 17 00:00:00 2001 From: Janne Mareike Koschinski Date: Tue, 22 Jul 2025 12:46:42 +0200 Subject: [PATCH 11/11] fix issue with overlapping mentions in anchor tags --- .../kotlin/net/folivo/trixnity/core/MatrixRegex.kt | 12 +++++++----- .../net/folivo/trixnity/core/MatrixRegexTest.kt | 12 ++++++++++++ 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt index 51891d3b7..cace0e37e 100644 --- a/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt +++ b/trixnity-core/src/commonMain/kotlin/net/folivo/trixnity/core/MatrixRegex.kt @@ -16,6 +16,7 @@ object MatrixRegex { fun findMentions(message: String): Map { val links = findLinkMentions(message) + println(links) val users = findIdMentions(message) val linksRange = links.keys.sortedBy { it.first } val uniqueUsers = users.filter { (user, _) -> @@ -61,12 +62,13 @@ object MatrixRegex 
{ } } - private fun List.overlaps(match: IntRange): Boolean { - val index = binarySearch { other -> + private fun List.overlaps(user: IntRange): Boolean { + val index = binarySearch { link -> when { - other.first > match.first -> -1 - other.last < match.last -> 1 - else -> 0 + user.last < link.first -> 1 + user.first > link.last -> -1 + user.first >= link.first && user.last <= link.last -> 0 + else -> -1 } } return index >= 0 diff --git a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixRegexTest.kt b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixRegexTest.kt index c51b7c959..64d848d1c 100644 --- a/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixRegexTest.kt +++ b/trixnity-core/src/commonTest/kotlin/net/folivo/trixnity/core/MatrixRegexTest.kt @@ -645,5 +645,17 @@ class MatrixRegexTest : TrixnityBaseTest() { ), actual = MatrixRegex.findMentions(content) ) + assertEquals( + expected = mapOf( + 9..44 to Mention.User(userId=UserId("@user:matrix.org")), + 92..171 to Mention.Room(roomId=RoomId("!WvOltebgJfkgHzhfpW:matrix.org"), parameters=parametersOf("via" to listOf("matrix.org", "imbitbu.de"))), + 199..323 to Mention.Event(roomId=RoomId("!WvOltebgJfkgHzhfpW:matrix.org"), eventId=EventId("\$KoEcMwZKqGpCeuMjAmt9zvmWgO72f7hDFkvfBMS479A"), parameters=parametersOf("via" to listOf("matrix.org", "imbitbu.de"))), + ), + actual = MatrixRegex.findMentions( + "Some Username: This is a user mention
" + + "https://matrix.to/#/!WvOltebgJfkgHzhfpW:matrix.org?via=matrix.org&via=imbitbu.de This is a room mention
" + + "https://matrix.to/#/!WvOltebgJfkgHzhfpW:matrix.org/\$KoEcMwZKqGpCeuMjAmt9zvmWgO72f7hDFkvfBMS479A?via=matrix.org&via=imbitbu.de This is an event mention" + ) + ) } } -- GitLab