[go: up one dir, main page]

File: url.rs

package info (click to toggle)
rust-linkify 0.10.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 220 kB
  • sloc: makefile: 2
file content (298 lines) | stat: -rw-r--r-- 10,321 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
use std::char;
use std::ops::Range;

use crate::domains::find_authority_end;
use crate::scanner::Scanner;

/// Minimum valid URL length, in bytes.
///
/// The shortest valid URL (without a scheme) might be g.cn (Google China),
/// which consists of four characters.
/// We set this as a lower threshold for parsing URLs from plaintext
/// to avoid false positives and as a slight performance optimization.
/// `DomainScanner::scan` compares it against the byte length of the text it is given.
/// This threshold might be adjusted in the future.
const MIN_URL_LENGTH: usize = 4;

/// Quote characters that may enclose a link.
///
/// `find_scheme_start` and `find_domain_start` stop scanning backwards when they hit one of
/// these and report it to their caller, which passes it to `find_url_end` so that the same
/// quote character terminates the link.
const QUOTES: &[char] = &['\'', '\"'];

/// Scan for URLs starting from the trigger character ":" (requires "://").
///
/// Based on RFC 3986.
pub struct UrlScanner {
    /// Passed on to `find_authority_end` and `find_url_end`. When `false`, `find_url_end`
    /// does not let a URL end in a character from U+0080 upwards.
    pub iri_parsing_enabled: bool,
}

/// Scan for plain domains (without scheme) such as `test.com` or `test.com/hi-there`.
pub struct DomainScanner {
    /// Passed on to `find_domain_start`, `find_authority_end` and `find_url_end`. When `true`,
    /// `find_domain_start` accepts characters from U+0080 upwards in the first domain label.
    pub iri_parsing_enabled: bool,
}

impl Scanner for UrlScanner {
    /// Scan for a URL around the separator at byte index `separator` in `s`.
    ///
    /// The separator has to be the `:` of a `://` sequence. On success the returned range
    /// starts at the first character of the scheme and ends after the last character that
    /// `find_url_end` accepted. Returns `None` if no URL was found.
    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
        const SCHEME_SEPARATOR: &str = "://";

        // The scheme needs at least one character in front of the separator, and we only
        // support schemes with authority, not things like `myscheme:mything`.
        if separator == 0 || !s[separator..].starts_with(SCHEME_SEPARATOR) {
            return None;
        }

        // Need at least one character after '//'
        let after_separator = separator + SCHEME_SEPARATOR.len();
        if after_separator >= s.len() {
            return None;
        }

        let (scheme_start, quote) = find_scheme_start(&s[..separator]);
        let start = scheme_start?;
        let require_host = scheme_requires_host(&s[start..separator]);
        let rest = &s[after_separator..];

        // Only the first element of the result (end of the authority) matters here.
        let (authority_end, _) =
            find_authority_end(rest, true, require_host, true, self.iri_parsing_enabled);
        let after_authority = authority_end?;
        let end = find_url_end(&rest[after_authority..], quote, self.iri_parsing_enabled)?;

        // Neither an authority nor anything after it: there is nothing to link.
        if after_authority == 0 && end == 0 {
            return None;
        }

        Some(start..after_separator + after_authority + end)
    }
}

impl Scanner for DomainScanner {
    /// Scan for a scheme-less link around the separator at byte index `separator` in `s`.
    ///
    /// On success the returned range starts at the position reported by `find_domain_start`
    /// and ends after the last character that `find_url_end` accepted. Returns `None` if no
    /// link was found.
    fn scan(&self, s: &str, separator: usize) -> Option<Range<usize>> {
        // There must be something before the separator for the domain, and the text must
        // have a minimum number of bytes.
        if separator == 0 || s.len() < MIN_URL_LENGTH {
            return None;
        }

        let iri = self.iri_parsing_enabled;
        let (label_start, quote) = find_domain_start(&s[..separator], iri);
        let start = label_start?;
        let candidate = &s[start..];

        // Both elements of the result have to be present; the second one is only checked
        // for presence.
        let domain_end = match find_authority_end(candidate, false, true, true, iri) {
            (Some(domain_end), Some(_)) => domain_end,
            _ => return None,
        };
        let end = find_url_end(&candidate[domain_end..], quote, iri)?;

        Some(start..start + domain_end + end)
    }
}

/// Locate the start of a scheme by walking backwards through `s`, the text in front of `://`.
///
/// For `https` the walk begins at the `s` and finishes at the `h`.
///
/// Returns `(start, quote)`:
/// - `start` is the byte index of the earliest ASCII letter reached, or `None` if there is no
///   usable scheme.
/// - `quote` is the quote character that stopped the walk, if any. It is dropped when the walk
///   is aborted by an `@`.
fn find_scheme_start(s: &str) -> (Option<usize>, Option<char>) {
    let mut earliest_letter = None;
    let mut earliest_digit = None;
    let mut opening_quote = None;

    for (pos, ch) in s.char_indices().rev() {
        if ch.is_ascii_alphabetic() {
            earliest_letter = Some(pos);
        } else if ch.is_ascii_digit() {
            earliest_digit = Some(pos);
        } else if ch == '@' {
            return (None, None);
        } else if QUOTES.contains(&ch) {
            // Remember a quote in front of the scheme and stop at it.
            // https://github.com/robinst/linkify/issues/20
            opening_quote = Some(ch);
            break;
        } else if !matches!(ch, '+' | '-' | '.') {
            // Anything that cannot occur in a scheme ends the walk; `+`, `-` and `.` are
            // skipped over without moving the start.
            break;
        }
    }

    // "abc://foo" must not be extracted out of "1abc://foo", so a digit sitting directly in
    // front of the first letter rejects the scheme. ".abc://foo" and others are fine, as
    // those characters feel more like separators.
    // Comparing byte indices one apart is ok as the scheme must be ASCII.
    let digit_directly_before = match (earliest_letter, earliest_digit) {
        (Some(letter), Some(digit)) => letter.checked_sub(1) == Some(digit),
        _ => false,
    };

    if digit_directly_before {
        (None, opening_quote)
    } else {
        (earliest_letter, opening_quote)
    }
}

/// Whether a scheme requires that authority looks like a host name (domain or IP address) or not
/// (can contain reg-name with arbitrary allowed characters).
///
/// The comparison ignores ASCII case because schemes are case-insensitive (RFC 3986,
/// section 3.1), so `HTTP` and `Https` are treated exactly like `http` and `https`.
///
/// We could make this configurable, but let's keep it simple until someone asks (hi!).
fn scheme_requires_host(scheme: &str) -> bool {
    const HOST_SCHEMES: [&str; 4] = ["https", "http", "ftp", "ssh"];

    HOST_SCHEMES
        .iter()
        .any(|known| scheme.eq_ignore_ascii_case(known))
}

/// Locate the start of a plain domain (no scheme) by walking backwards through `s`, the text
/// in front of the first `.`. For `blog.` the walk begins at the `g` and finishes at the `b`.
///
/// The rules are:
/// - A domain is made of labels separated by `.`. Because the walk begins at the first `.`,
///   only a single label has to be handled.
/// - A label can not start or end with `-`.
/// - A label can contain ASCII letters, digits, `-`, and characters from U+0080 upwards if
///   `iri_parsing_enabled` is true.
///
/// Returns `(start, quote)`: the byte index where the label starts and the quote character
/// that stopped the walk, if any. Every rejection yields `(None, None)`.
fn find_domain_start(s: &str, iri_parsing_enabled: bool) -> (Option<usize>, Option<char>) {
    let mut label_start: Option<usize> = None;
    let mut opening_quote = None;

    for (pos, ch) in s.char_indices().rev() {
        let is_label_char =
            ch.is_ascii_alphanumeric() || (iri_parsing_enabled && ch >= '\u{80}');
        if is_label_char {
            label_start = Some(pos);
            continue;
        }

        match ch {
            // `/`: something valid like `https://www.` would have been found by the ":"
            //      scanner already, and `.../www.example.com` just by itself is not wanted.
            //      `//www.example.com` (scheme-relative URLs) *could* be allowed in the future.
            // `@`: similarly, an email address would have been found already.
            // `.`: a valid domain would have been extracted from the previous "." already.
            '/' | '@' | '.' => return (None, None),
            // Domain label can't end with `-`
            '-' if label_start.is_none() => return (None, None),
            '-' => label_start = Some(pos),
            _ => {
                // Remember a quote in front of the domain, e.g. with `"www.example.com"`.
                // Any other character simply ends the walk.
                if QUOTES.contains(&ch) {
                    opening_quote = Some(ch);
                }
                break;
            }
        }
    }

    match label_start {
        // Domain label can't start with `-`
        Some(pos) if s[pos..].starts_with('-') => (None, None),
        _ => (label_start, opening_quote),
    }
}

/// Determine where a URL ends, given the text `s` that follows an already validated authority.
///
/// For `https://example.com/foo`, `s` is `/foo` and the result is `Some(4)`, the byte offset
/// just past the final `o`.
///
/// - `quote`: quote character found in front of the link, if any; the same character ends
///   the URL.
/// - `iri_parsing_enabled`: when `false`, characters from U+0080 upwards are scanned over but
///   can never be the last character of the URL.
///
/// Always returns `Some`. `Some(0)` means nothing after the authority belongs to the URL.
fn find_url_end(s: &str, quote: Option<char>, iri_parsing_enabled: bool) -> Option<usize> {
    // Only a path (`/`) or a query (`?`) can continue the URL right after the authority.
    if !(s.starts_with('/') || s.starts_with('?')) {
        return Some(0);
    }

    // Number of currently unclosed brackets of each kind.
    let mut open_round = 0;
    let mut open_square = 0;
    let mut open_curly = 0;
    // True while an odd number of single quotes has been seen.
    let mut inside_single_quote = false;

    let mut prev_may_end = true;
    let mut url_end = 0;

    for (offset, ch) in s.char_indices() {
        let may_end = match ch {
            // These can never be part of a URL, so stop now. See RFC 3986 and RFC 3987.
            // Some characters are not in this list even though they are neither "unreserved"
            // nor "reserved":
            //   '\\', '^', '{', '}'
            // The reason is that other link detectors also allow them. Braces additionally
            // have to be balanced, see below.
            '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => {
                break
            }
            // Allowed inside a URL but not as its last character. The spec would permit
            // them, but in plain text they are frequently delimiters that are not meant to
            // be part of the URL.
            '?' | '!' | '.' | ',' | ':' | ';' | '*' => false,
            // A slash may end the URL only if the character before it could have ended it.
            '/' => prev_may_end,
            '(' => {
                open_round += 1;
                false
            }
            ')' => {
                if open_round == 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                open_round -= 1;
                true
            }
            '[' => {
                open_square += 1;
                false
            }
            ']' => {
                if open_square == 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                open_square -= 1;
                true
            }
            '{' => {
                open_curly += 1;
                false
            }
            '}' => {
                if open_curly == 0 {
                    // More closing than opening brackets, stop now
                    break;
                }
                open_curly -= 1;
                true
            }
            // Found the quote matching the one in front of the URL, stop now
            _ if quote == Some(ch) => break,
            '\'' => {
                // A single quote can only end the URL once an even number has been seen
                inside_single_quote = !inside_single_quote;
                !inside_single_quote
            }
            _ if !iri_parsing_enabled && ch >= '\u{80}' => false,
            _ => true,
        };

        if may_end {
            url_end = offset + ch.len_utf8();
        }
        prev_may_end = may_end;
    }

    Some(url_end)
}