1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
|
//! Domain name related scanning, used by both email and URL scanners.
//!
//! This module is called "domains" for familiarity, but it actually deals with the authority part
//! of URLs as defined in <https://datatracker.ietf.org/doc/html/rfc3986#section-3.2>:
//!
//! ```text
//! authority = [ userinfo "@" ] host [ ":" port ]
//!
//!
//! userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
//!
//! host = IP-literal / IPv4address / reg-name
//!
//! IP-literal = "[" ( IPv6address / IPvFuture ) "]"
//!
//! IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
//!
//! reg-name = *( unreserved / pct-encoded / sub-delims )
//!
//!
//! unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
//!
//! sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
//!
//! pct-encoded = "%" HEXDIG HEXDIG
//! ```
use std::char;
/// Scans `s` from its first byte and finds where a URL authority
/// (`[ userinfo "@" ] host [ ":" port ]`) ends.
///
/// # Parameters
///
/// * `s` - text that starts at the candidate authority.
/// * `userinfo_allowed` - whether a `userinfo@` prefix may still appear. When `false`, an `@`
///   makes the whole authority invalid.
/// * `require_host` - whether the host has to look like a real host name or IPv4 address. When
///   `false`, anything up to a delimiter is accepted.
/// * `port_allowed` - whether a `:` may introduce a port. If neither userinfo nor a port is
///   allowed, scanning stops at the first `:`.
/// * `iri_parsing_enabled` - whether non-ASCII characters count as label characters. When
///   `false`, scanning stops at the first non-ASCII character.
///
/// # Returns
///
/// A pair `(end, last_dot)`:
///
/// * `end` is the byte offset just past the last character that may end the authority
///   (`Some(0)` if no such character was seen).
/// * `last_dot` is the byte offset of the last `.` that was followed by a label character.
///
/// `(None, None)` is returned when a second `@` is found, or when `require_host` is set and the
/// host is not plausible (bad label, numeric host that is not four numbers, or a TLD that does
/// not start with two ASCII letters).
pub(crate) fn find_authority_end(
    s: &str,
    mut userinfo_allowed: bool,
    require_host: bool,
    port_allowed: bool,
    iri_parsing_enabled: bool,
) -> (Option<usize>, Option<usize>) {
    let mut end = Some(0);

    // A dot only becomes the "last dot" once a label character follows it.
    let mut maybe_last_dot = None;
    let mut last_dot = None;
    // Number of dots that were followed by a digit; used for the IPv4 shape check.
    let mut number_dots = 0;
    let mut dot_allowed = false;
    let mut hyphen_allowed = false;
    let mut all_numeric = true;
    let mut maybe_host = true;
    let mut host_ended = false;

    for (i, c) in s.char_indices() {
        let can_be_last = match c {
            // ALPHA
            'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => {
                // Without IRI support every non-ASCII character (including U+0080 itself)
                // terminates the authority.
                if !iri_parsing_enabled && !c.is_ascii() {
                    break;
                }

                // Can start or end a domain label, but not numeric
                dot_allowed = true;
                hyphen_allowed = true;
                last_dot = maybe_last_dot;
                all_numeric = false;

                if host_ended {
                    maybe_host = false;
                }

                !require_host || !host_ended
            }
            // DIGIT
            '0'..='9' => {
                // Same as above, except numeric
                dot_allowed = true;
                hyphen_allowed = true;
                if last_dot != maybe_last_dot {
                    last_dot = maybe_last_dot;
                    number_dots += 1;
                }

                if host_ended {
                    maybe_host = false;
                }

                !require_host || !host_ended
            }
            // unreserved
            '-' => {
                // Hyphen can't be at start of a label, e.g. `-b` in `a.-b.com`
                if !hyphen_allowed {
                    maybe_host = false;
                }
                // Hyphen can't be at end of a label, e.g. `b-` in `a.b-.com`
                dot_allowed = false;
                all_numeric = false;

                !require_host
            }
            '.' => {
                if !dot_allowed {
                    // Label can't be empty, e.g. `.example.com` or `a..com`
                    host_ended = true;
                }
                dot_allowed = false;
                hyphen_allowed = false;
                maybe_last_dot = Some(i);

                false
            }
            '_' | '~' => {
                // Hostnames can't contain these and we don't want to treat them as delimiters.
                maybe_host = false;

                false
            }
            // sub-delims
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => {
                // Can't be in hostnames, but we treat them as delimiters
                host_ended = true;

                if !userinfo_allowed && require_host {
                    // We don't have to look further
                    break;
                }

                false
            }
            ':' => {
                // Could be in userinfo, or we're getting a port now.
                if !userinfo_allowed && !port_allowed {
                    break;
                }

                // Don't advance the last dot when we get to port numbers
                maybe_last_dot = last_dot;

                false
            }
            '@' => {
                if !userinfo_allowed {
                    // We already had userinfo, can't have another `@` in a valid authority.
                    return (None, None);
                }

                // Everything before this has been userinfo, so reset all the state we
                // collected about the host. `number_dots` must be reset as well, otherwise
                // numeric labels in the userinfo distort the IPv4 check for the real host.
                userinfo_allowed = false;

                maybe_last_dot = None;
                last_dot = None;
                number_dots = 0;
                dot_allowed = false;
                hyphen_allowed = false;
                all_numeric = true;
                maybe_host = true;
                host_ended = false;

                false
            }
            '/' => {
                if !require_host {
                    // For schemes where we allow anything, we want to stop at delimiter
                    // characters except if we get a slash closing the URL, which happened here.
                    end = Some(i);
                }
                break;
            }
            _ => {
                // Anything else, this might be the end of the authority (can be empty).
                // Now let the rest of the code handle checking whether the end of the URL is
                // valid.
                break;
            }
        };

        if can_be_last {
            end = Some(i + c.len_utf8());
        }
    }

    if !require_host {
        return (end, last_dot);
    }
    if !maybe_host {
        return (None, None);
    }

    if all_numeric {
        // For IPv4 addresses, require 4 numbers
        if number_dots != 3 {
            return (None, None);
        }
    } else if let Some(last_dot) = last_dot {
        // If we have something that is not just numeric (not an IP address), check that the
        // TLD looks reasonable. This is to avoid linking things like `abc@v1.1`.
        // `last_dot` is the offset of an ASCII `.`, so `last_dot + 1` is a char boundary.
        if !valid_tld(&s[last_dot + 1..]) {
            return (None, None);
        }
    }

    (end, last_dot)
}
/// Returns `true` when `tld` begins with at least two ASCII letters.
///
/// Only the first two characters are inspected; whatever follows them is ignored.
fn valid_tld(tld: &str) -> bool {
    let mut leading = tld.chars();
    match (leading.next(), leading.next()) {
        (Some(first), Some(second)) => {
            first.is_ascii_alphabetic() && second.is_ascii_alphabetic()
        }
        // Fewer than two characters can never form a plausible TLD.
        _ => false,
    }
}
|