Ada 3.4.4
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers.cpp
Go to the documentation of this file.
1#if ADA_INCLUDE_URL_PATTERN
3
4#include <algorithm>
5#include <array>
6#include <charconv>
7#include <optional>
8#include <ranges>
9#include <string>
10
11#include "ada/character_sets.h"
12#include "ada/helpers.h"
13#include "ada/scheme.h"
14#include "ada/unicode.h"
15
16namespace ada::url_pattern_helpers {
17
18std::tuple<std::string, std::vector<std::string>>
19generate_regular_expression_and_name_list(
20 const std::vector<url_pattern_part>& part_list,
21 url_pattern_compile_component_options options) {
22 // Let result be "^"
23 std::string result = "^";
24 // Reserve capacity to reduce reallocations
25 result.reserve(part_list.size() * 16);
26
27 // Let name list be a new list
28 std::vector<std::string> name_list{};
29 name_list.reserve(part_list.size());
30
31 // Pre-generate segment wildcard regexp if needed (avoids repeated generation)
32 std::string segment_wildcard_regexp;
33
34 // For each part of part list:
35 for (const url_pattern_part& part : part_list) {
36 // If part's type is "fixed-text":
37 if (part.type == url_pattern_part_type::FIXED_TEXT) {
38 // If part's modifier is "none"
39 if (part.modifier == url_pattern_part_modifier::none) {
40 result.append(escape_regexp_string(part.value));
41 } else {
42 // (?:<fixed text>)<modifier>
43 result.append("(?:");
44 result.append(escape_regexp_string(part.value));
45 result.push_back(')');
46 result.append(convert_modifier_to_string(part.modifier));
47 }
48 continue;
49 }
50
51 // Assert: part's name is not the empty string
52 ADA_ASSERT_TRUE(!part.name.empty());
53 name_list.push_back(part.name);
54
55 // Use string_view to avoid copies where possible
56 std::string_view regexp_value = part.value;
57
58 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD) {
59 // Lazy generate segment wildcard regexp
60 if (segment_wildcard_regexp.empty()) {
61 segment_wildcard_regexp = generate_segment_wildcard_regexp(options);
62 }
63 regexp_value = segment_wildcard_regexp;
64 } else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
65 regexp_value = ".*";
66 }
67
68 // If part's prefix is the empty string and part's suffix is the empty
69 // string
70 if (part.prefix.empty() && part.suffix.empty()) {
71 // If part's modifier is "none" or "optional"
72 if (part.modifier == url_pattern_part_modifier::none ||
73 part.modifier == url_pattern_part_modifier::optional) {
74 // (<regexp value>)<modifier>
75 result.push_back('(');
76 result.append(regexp_value);
77 result.push_back(')');
78 result.append(convert_modifier_to_string(part.modifier));
79 } else {
80 // ((?:<regexp value>)<modifier>)
81 result.append("((?:");
82 result.append(regexp_value);
83 result.push_back(')');
84 result.append(convert_modifier_to_string(part.modifier));
85 result.push_back(')');
86 }
87 continue;
88 }
89
90 // If part's modifier is "none" or "optional"
91 if (part.modifier == url_pattern_part_modifier::none ||
92 part.modifier == url_pattern_part_modifier::optional) {
93 // (?:<prefix>(<regexp value>)<suffix>)<modifier>
94 result.append("(?:");
95 result.append(escape_regexp_string(part.prefix));
96 result.push_back('(');
97 result.append(regexp_value);
98 result.push_back(')');
99 result.append(escape_regexp_string(part.suffix));
100 result.push_back(')');
101 result.append(convert_modifier_to_string(part.modifier));
102 continue;
103 }
104
105 // Assert: part's modifier is "zero-or-more" or "one-or-more"
106 ADA_ASSERT_TRUE(part.modifier == url_pattern_part_modifier::zero_or_more ||
107 part.modifier == url_pattern_part_modifier::one_or_more);
108
109 // Assert: part's prefix is not the empty string or part's suffix is not the
110 // empty string
111 ADA_ASSERT_TRUE(!part.prefix.empty() || !part.suffix.empty());
112
113 // (?:<prefix>((?:<regexp value>)(?:<suffix><prefix>(?:<regexp
114 // value>))*)<suffix>)?
115 // Append "(?:" to the end of result.
116 result.append("(?:");
117 // Append the result of running escape a regexp string given part's prefix
118 // to the end of result.
119 result.append(escape_regexp_string(part.prefix));
120 // Append "((?:" to the end of result.
121 result.append("((?:");
122 // Append regexp value to the end of result.
123 result.append(regexp_value);
124 // Append ")(?:" to the end of result.
125 result.append(")(?:");
126 // Append the result of running escape a regexp string given part's suffix
127 // to the end of result.
128 result.append(escape_regexp_string(part.suffix));
129 // Append the result of running escape a regexp string given part's prefix
130 // to the end of result.
131 result.append(escape_regexp_string(part.prefix));
132 // Append "(?:" to the end of result.
133 result.append("(?:");
134 // Append regexp value to the end of result.
135 result.append(regexp_value);
136 // Append "))*)" to the end of result.
137 result.append("))*)");
138 // Append the result of running escape a regexp string given part's suffix
139 // to the end of result.
140 result.append(escape_regexp_string(part.suffix));
141 // Append ")" to the end of result.
142 result.append(")");
143
144 // If part's modifier is "zero-or-more" then append "?" to the end of result
145 if (part.modifier == url_pattern_part_modifier::zero_or_more) {
146 result += "?";
147 }
148 }
149
150 // Append "$" to the end of result
151 result += "$";
152
153 // Return (result, name list)
154 return {std::move(result), std::move(name_list)};
155}
156
// Reports whether `input` looks like the start of an IPv6 hostname pattern.
//
// Returns true when `input` has at least two characters and begins with "[",
// "{[" or "\[". Returns false otherwise.
bool is_ipv6_address(std::string_view input) noexcept {
  // Fewer than two characters can never match.
  if (input.size() < 2) {
    return false;
  }

  const char first = input[0];
  // A bare opening bracket is enough on its own.
  if (first == '[') {
    return true;
  }
  // Otherwise the bracket must directly follow a '{' or a backslash.
  const bool introducer = first == '{' || first == '\\';
  return introducer && input[1] == '[';
}
171
172std::string_view convert_modifier_to_string(
173 url_pattern_part_modifier modifier) {
174 switch (modifier) {
175 // If modifier is "zero-or-more", then return "*".
176 case url_pattern_part_modifier::zero_or_more:
177 return "*";
178 // If modifier is "optional", then return "?".
179 case url_pattern_part_modifier::optional:
180 return "?";
181 // If modifier is "one-or-more", then return "+".
182 case url_pattern_part_modifier::one_or_more:
183 return "+";
184 // Return the empty string.
185 default:
186 return "";
187 }
188}
189
190std::string generate_segment_wildcard_regexp(
191 url_pattern_compile_component_options options) {
192 // Let result be "[^".
193 std::string result = "[^";
194 // Append the result of running escape a regexp string given options's
195 // delimiter code point to the end of result.
196 result.append(escape_regexp_string(options.get_delimiter()));
197 // Append "]+?" to the end of result.
198 result.append("]+?");
199 // Return result.
200 ada_log("generate_segment_wildcard_regexp result: ", result);
201 return result;
202}
203
namespace {
// Character classification flags shared by the canonicalization fast paths.
// Each entry of char_class_table is a bitwise OR of these values.
constexpr uint8_t CHAR_SCHEME = 1;  // allowed in a scheme: a-z A-Z 0-9 + - .
constexpr uint8_t CHAR_UPPER = 2;   // uppercase ASCII letter (A-Z)
constexpr uint8_t CHAR_SIMPLE_HOSTNAME = 4;  // a-z 0-9 - .
constexpr uint8_t CHAR_SIMPLE_PATHNAME = 8;  // a-z A-Z 0-9 / - _ ~

// Computes the 256-entry classification table. Bytes that are not listed
// below keep a value of zero.
constexpr std::array<uint8_t, 256> build_char_class_table() {
  constexpr uint8_t lower_or_digit =
      CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME | CHAR_SIMPLE_PATHNAME;
  constexpr uint8_t upper = CHAR_SCHEME | CHAR_UPPER | CHAR_SIMPLE_PATHNAME;

  std::array<uint8_t, 256> flags{};
  for (int offset = 0; offset < 26; ++offset) {
    flags['a' + offset] = lower_or_digit;
    flags['A' + offset] = upper;
  }
  for (int offset = 0; offset < 10; ++offset) {
    flags['0' + offset] = lower_or_digit;
  }
  flags['+'] = CHAR_SCHEME;
  flags['-'] = lower_or_digit;
  // '.' is deliberately not a simple pathname character: dots in a path may
  // require normalization.
  flags['.'] = CHAR_SCHEME | CHAR_SIMPLE_HOSTNAME;
  flags['/'] = CHAR_SIMPLE_PATHNAME;
  flags['_'] = CHAR_SIMPLE_PATHNAME;
  flags['~'] = CHAR_SIMPLE_PATHNAME;
  return flags;
}

// Lookup table indexed by the unsigned value of a character.
constexpr std::array<uint8_t, 256> char_class_table = build_char_class_table();
}  // namespace
231
232tl::expected<std::string, errors> canonicalize_protocol(
233 std::string_view input) {
234 ada_log("canonicalize_protocol called with input=", input);
235 if (input.empty()) [[unlikely]] {
236 return "";
237 }
238
239 if (input.ends_with(":")) {
240 input.remove_suffix(1);
241 }
242
243 // Fast path: special schemes are already canonical
244 if (scheme::is_special(input)) {
245 return std::string(input);
246 }
247
248 // Fast path: validate scheme chars and check for uppercase
249 // First char must be alpha (not +, -, ., or digit)
250 uint8_t first_flags = char_class_table[static_cast<uint8_t>(input[0])];
251 if (!(first_flags & CHAR_SCHEME) || input[0] == '+' || input[0] == '-' ||
252 input[0] == '.' || unicode::is_ascii_digit(input[0])) {
253 return tl::unexpected(errors::type_error);
254 }
255
256 uint8_t needs_lowercase = first_flags & CHAR_UPPER;
257 for (size_t i = 1; i < input.size(); i++) {
258 uint8_t flags = char_class_table[static_cast<uint8_t>(input[i])];
259 if (!(flags & CHAR_SCHEME)) {
260 return tl::unexpected(errors::type_error);
261 }
262 needs_lowercase |= flags & CHAR_UPPER;
263 }
264
265 if (needs_lowercase == 0) {
266 return std::string(input);
267 }
268
269 std::string result(input);
270 unicode::to_lower_ascii(result.data(), result.size());
271 return result;
272}
273
274tl::expected<std::string, errors> canonicalize_username(
275 std::string_view input) {
276 // If value is the empty string, return value.
277 if (input.empty()) [[unlikely]] {
278 return "";
279 }
280 // Percent-encode the input using the userinfo percent-encode set.
282 input, character_sets::USERINFO_PERCENT_ENCODE);
283 if (idx == input.size()) {
284 // No encoding needed, return input as-is
285 return std::string(input);
286 }
287 // Percent-encode from the first character that needs encoding
288 return ada::unicode::percent_encode(
289 input, character_sets::USERINFO_PERCENT_ENCODE, idx);
290}
291
292tl::expected<std::string, errors> canonicalize_password(
293 std::string_view input) {
294 // If value is the empty string, return value.
295 if (input.empty()) [[unlikely]] {
296 return "";
297 }
298 // Percent-encode the input using the userinfo percent-encode set.
300 input, character_sets::USERINFO_PERCENT_ENCODE);
301 if (idx == input.size()) {
302 // No encoding needed, return input as-is
303 return std::string(input);
304 }
305 // Percent-encode from the first character that needs encoding
306 return ada::unicode::percent_encode(
307 input, character_sets::USERINFO_PERCENT_ENCODE, idx);
308}
309
310tl::expected<std::string, errors> canonicalize_hostname(
311 std::string_view input) {
312 ada_log("canonicalize_hostname input=", input);
313 if (input.empty()) [[unlikely]] {
314 return "";
315 }
316
317 // Fast path: simple hostnames (lowercase ASCII, digits, -, .) need no IDNA
318 bool needs_processing = false;
319 for (char c : input) {
320 needs_processing |=
321 !(char_class_table[static_cast<uint8_t>(c)] & CHAR_SIMPLE_HOSTNAME);
322 }
323 if (!needs_processing) {
324 return std::string(input);
325 }
326
327 // Let dummyURL be a new URL record.
328 // Let parseResult be the result of running the basic URL parser given value
329 // with dummyURL as url and hostname state as state override.
330
331 // IMPORTANT: The protocol needs to be a special protocol, otherwise the
332 // hostname will not be converted using IDNA.
333 auto url = ada::parse<url_aggregator>("https://dummy.test", nullptr);
334 ADA_ASSERT_TRUE(url);
335 // if (!isValidHostnameInput(hostname)) return kj::none;
336 if (!url->set_hostname(input)) {
337 // If parseResult is failure, then throw a TypeError.
338 return tl::unexpected(errors::type_error);
339 }
340 // Return dummyURL's host, serialized, or empty string if it is null.
341 return std::string(url->get_hostname());
342}
343
344tl::expected<std::string, errors> canonicalize_ipv6_hostname(
345 std::string_view input) {
346 ada_log("canonicalize_ipv6_hostname input=", input);
347 // TODO: Optimization opportunity: Use lookup table to speed up checking
348 if (std::ranges::any_of(input, [](char c) {
349 return c != '[' && c != ']' && c != ':' &&
350 !unicode::is_ascii_hex_digit(c);
351 })) {
352 return tl::unexpected(errors::type_error);
353 }
354 // Append the result of running ASCII lowercase given code point to the end of
355 // result.
356 auto hostname = std::string(input);
357 unicode::to_lower_ascii(hostname.data(), hostname.size());
358 return hostname;
359}
360
361tl::expected<std::string, errors> canonicalize_port(
362 std::string_view port_value) {
363 // If portValue is the empty string, return portValue.
364 if (port_value.empty()) [[unlikely]] {
365 return "";
366 }
367
368 // Remove ASCII tab or newline characters
369 std::string trimmed(port_value);
370 helpers::remove_ascii_tab_or_newline(trimmed);
371
372 if (trimmed.empty()) {
373 return "";
374 }
375
376 // Input should start with a digit character
377 if (!unicode::is_ascii_digit(trimmed.front())) {
378 return tl::unexpected(errors::type_error);
379 }
380
381 // Find the first non-digit character
382 auto first_non_digit =
383 std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
384 std::string_view digits_to_parse =
385 std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
386
387 // Here we have that a range of ASCII digit characters identified
388 // by digits_to_parse. It is none empty.
389 // We want to determine whether it is a valid port number (0-65535).
390 // Clearly, if the length is greater than 5, it is invalid.
391 // If the length is 5, we need to compare lexicographically to "65535".
392 // Otherwise it is valid.
393 if (digits_to_parse.size() == 5) {
394 if (digits_to_parse > "65535") {
395 return tl::unexpected(errors::type_error);
396 }
397 } else if (digits_to_parse.size() > 5) {
398 return tl::unexpected(errors::type_error);
399 }
400 if (digits_to_parse[0] == '0' && digits_to_parse.size() > 1) {
401 // Leading zeros are not allowed for multi-digit ports
402 return tl::unexpected(errors::type_error);
403 }
404 // It is valid! Most times, we do not need to parse it into an integer.
405 return std::string(digits_to_parse);
406}
407
408tl::expected<std::string, errors> canonicalize_port_with_protocol(
409 std::string_view port_value, std::string_view protocol) {
410 // If portValue is the empty string, return portValue.
411 if (port_value.empty()) [[unlikely]] {
412 return "";
413 }
414
415 // Handle empty or trailing colon in protocol
416 if (protocol.empty()) {
417 protocol = "fake";
418 } else if (protocol.ends_with(":")) {
419 protocol.remove_suffix(1);
420 }
421
422 // Remove ASCII tab or newline characters
423 std::string trimmed(port_value);
424 helpers::remove_ascii_tab_or_newline(trimmed);
425
426 if (trimmed.empty()) {
427 return "";
428 }
429
430 // Input should start with a digit character
431 if (!unicode::is_ascii_digit(trimmed.front())) {
432 return tl::unexpected(errors::type_error);
433 }
434
435 // Find the first non-digit character
436 auto first_non_digit =
437 std::ranges::find_if_not(trimmed, unicode::is_ascii_digit);
438 std::string_view digits_to_parse =
439 std::string_view(trimmed.data(), first_non_digit - trimmed.begin());
440
441 // Parse the port number
442 uint16_t parsed_port{};
443 auto result = std::from_chars(digits_to_parse.data(),
444 digits_to_parse.data() + digits_to_parse.size(),
445 parsed_port);
446
447 if (result.ec == std::errc::result_out_of_range) {
448 return tl::unexpected(errors::type_error);
449 }
450
451 if (result.ec == std::errc()) {
452 // Check if this is the default port for the scheme
453 uint16_t default_port = scheme::get_special_port(protocol);
454
455 // If it's the default port for a special scheme, return empty string
456 if (default_port != 0 && default_port == parsed_port) {
457 return "";
458 }
459
460 // Successfully parsed, return as string
461 return std::to_string(parsed_port);
462 }
463
464 return tl::unexpected(errors::type_error);
465}
466
467tl::expected<std::string, errors> canonicalize_pathname(
468 std::string_view input) {
469 if (input.empty()) [[unlikely]] {
470 return "";
471 }
472
473 // Fast path: simple pathnames (no . which needs normalization) can be
474 // returned as-is
475 bool needs_processing = false;
476 for (char c : input) {
477 needs_processing |=
478 !(char_class_table[static_cast<uint8_t>(c)] & CHAR_SIMPLE_PATHNAME);
479 }
480 if (!needs_processing) {
481 return std::string(input);
482 }
483
484 // Let leading slash be true if the first code point in value is U+002F (/)
485 // and otherwise false.
486 const bool leading_slash = input.starts_with("/");
487 // Let modified value be "/-" if leading slash is false and otherwise the
488 // empty string.
489 const auto modified_value = leading_slash ? "" : "/-";
490 const auto full_url =
491 std::string("fake://fake-url") + modified_value + std::string(input);
492 if (auto url = ada::parse<url_aggregator>(full_url, nullptr)) {
493 const auto pathname = url->get_pathname();
494 // If leading slash is false, then set result to the code point substring
495 // from 2 to the end of the string within result.
496 if (!leading_slash) {
497 // pathname should start with "/-" but path traversal (e.g. "../../")
498 // can reduce it to just "/" which is shorter than 2 characters.
499 if (pathname.size() < 2) {
500 return tl::unexpected(errors::type_error);
501 }
502 return std::string(pathname.substr(2));
503 }
504 return std::string(pathname);
505 }
506 // If parseResult is failure, then throw a TypeError.
507 return tl::unexpected(errors::type_error);
508}
509
510tl::expected<std::string, errors> canonicalize_opaque_pathname(
511 std::string_view input) {
512 // If value is the empty string, return value.
513 if (input.empty()) [[unlikely]] {
514 return "";
515 }
516 // Let dummyURL be a new URL record.
517 // Set dummyURL's path to the empty string.
518 // Let parseResult be the result of running URL parsing given value with
519 // dummyURL as url and opaque path state as state override.
520 if (auto url =
521 ada::parse<url_aggregator>("fake:" + std::string(input), nullptr)) {
522 // Return the result of URL path serializing dummyURL.
523 return std::string(url->get_pathname());
524 }
525 // If parseResult is failure, then throw a TypeError.
526 return tl::unexpected(errors::type_error);
527}
528
529tl::expected<std::string, errors> canonicalize_search(std::string_view input) {
530 // If value is the empty string, return value.
531 if (input.empty()) [[unlikely]] {
532 return "";
533 }
534 // Remove leading '?' if present
535 std::string new_value;
536 new_value = input[0] == '?' ? input.substr(1) : input;
537 // Remove ASCII tab or newline characters
538 helpers::remove_ascii_tab_or_newline(new_value);
539
540 if (new_value.empty()) {
541 return "";
542 }
543
544 // Percent-encode using QUERY_PERCENT_ENCODE (for non-special URLs)
545 // Note: "fake://dummy.test" is not a special URL, so we use
546 // QUERY_PERCENT_ENCODE
548 new_value, character_sets::QUERY_PERCENT_ENCODE);
549 if (idx == new_value.size()) {
550 // No encoding needed
551 return new_value;
552 }
553 // Percent-encode from the first character that needs encoding
554 return ada::unicode::percent_encode(
555 new_value, character_sets::QUERY_PERCENT_ENCODE, idx);
556}
557
558tl::expected<std::string, errors> canonicalize_hash(std::string_view input) {
559 // If value is the empty string, return value.
560 if (input.empty()) [[unlikely]] {
561 return "";
562 }
563 // Remove leading '#' if present
564 std::string new_value;
565 new_value = input[0] == '#' ? input.substr(1) : input;
566 // Remove ASCII tab or newline characters
567 helpers::remove_ascii_tab_or_newline(new_value);
568
569 if (new_value.empty()) {
570 return "";
571 }
572
573 // Percent-encode using FRAGMENT_PERCENT_ENCODE
575 new_value, character_sets::FRAGMENT_PERCENT_ENCODE);
576 if (idx == new_value.size()) {
577 // No encoding needed
578 return new_value;
579 }
580 // Percent-encode from the first character that needs encoding
581 return ada::unicode::percent_encode(
582 new_value, character_sets::FRAGMENT_PERCENT_ENCODE, idx);
583}
584
// Splits a URL pattern string into a list of tokens.
//
// input:  the pattern text to scan.
// policy: forwarded to the Tokenizer; it is not inspected directly here.
//         NOTE(review): presumably it selects whether a tokenizing error is
//         fatal or recoverable - confirm against
//         Tokenizer::process_tokenizing_error.
//
// The loop reads one code point at a time and emits:
//   '*'          -> ASTERISK
//   '+' or '?'   -> OTHER_MODIFIER
//   '\' + next   -> ESCAPED_CHAR (a trailing '\' is a tokenizing error)
//   '{' / '}'    -> OPEN / CLOSE
//   ':' + name   -> NAME (an empty name is a tokenizing error)
//   '(' ... ')'  -> REGEXP (non-ASCII content, a leading '?', a trailing '\',
//                   a nested '(' not followed by '?', unbalanced parentheses
//                   and an empty body are tokenizing errors)
//   anything else-> CHAR
// and finally appends an END token.
//
// Every tokenizing error goes through tokenizer.process_tokenizing_error; when
// that call yields an error value it is returned as tl::unexpected, otherwise
// scanning resumes with the next iteration of the outer loop.
//
// Returns the tokenizer's token list on success.
585tl::expected<std::vector<token>, errors> tokenize(std::string_view input,
586 token_policy policy) {
587 ada_log("tokenize input: ", input);
588 // Let tokenizer be a new tokenizer.
589 // Set tokenizer's input to input.
590 // Set tokenizer's policy to policy.
591 auto tokenizer = Tokenizer(input, policy);
592 // While tokenizer's index is less than tokenizer's input's code point length:
593 while (tokenizer.index < tokenizer.input.size()) {
594 // Run seek and get the next code point given tokenizer and tokenizer's
595 // index.
596 tokenizer.seek_and_get_next_code_point(tokenizer.index);
597
598 // If tokenizer's code point is U+002A (*):
599 if (tokenizer.code_point == '*') {
600 // Run add a token with default position and length given tokenizer and
601 // "asterisk".
602 tokenizer.add_token_with_defaults(token_type::ASTERISK);
603 ada_log("add ASTERISK token");
604 // Continue.
605 continue;
606 }
607
608 // If tokenizer's code point is U+002B (+) or U+003F (?):
609 if (tokenizer.code_point == '+' || tokenizer.code_point == '?') {
610 // Run add a token with default position and length given tokenizer and
611 // "other-modifier".
612 tokenizer.add_token_with_defaults(token_type::OTHER_MODIFIER);
613 // Continue.
614 continue;
615 }
616
617 // If tokenizer's code point is U+005C (\‍):
618 if (tokenizer.code_point == '\\') {
619 // If tokenizer's index is equal to tokenizer's input's code point length
620 // - 1:
621 if (tokenizer.index == tokenizer.input.size() - 1) {
622 // Run process a tokenizing error given tokenizer, tokenizer's next
623 // index, and tokenizer's index.
624 if (auto error = tokenizer.process_tokenizing_error(
625 tokenizer.next_index, tokenizer.index)) {
626 ada_log("process_tokenizing_error failed");
627 return tl::unexpected(*error);
628 }
629 continue;
630 }
631
632 // Let escaped index be tokenizer's next index.
633 auto escaped_index = tokenizer.next_index;
634 // Run get the next code point given tokenizer.
635 tokenizer.get_next_code_point();
636 // Run add a token with default length given tokenizer, "escaped-char",
637 // tokenizer's next index, and escaped index.
638 tokenizer.add_token_with_default_length(
639 token_type::ESCAPED_CHAR, tokenizer.next_index, escaped_index);
640 ada_log("add ESCAPED_CHAR token on next_index ", tokenizer.next_index,
641 " with escaped index ", escaped_index);
642 // Continue.
643 continue;
644 }
645
646 // If tokenizer's code point is U+007B ({):
647 if (tokenizer.code_point == '{') {
648 // Run add a token with default position and length given tokenizer and
649 // "open".
650 tokenizer.add_token_with_defaults(token_type::OPEN);
651 ada_log("add OPEN token");
652 continue;
653 }
654
655 // If tokenizer's code point is U+007D (}):
656 if (tokenizer.code_point == '}') {
657 // Run add a token with default position and length given tokenizer and
658 // "close".
659 tokenizer.add_token_with_defaults(token_type::CLOSE);
660 ada_log("add CLOSE token");
661 continue;
662 }
663
664 // If tokenizer's code point is U+003A (:):
665 if (tokenizer.code_point == ':') {
666 // Let name position be tokenizer's next index.
667 auto name_position = tokenizer.next_index;
668 // Let name start be name position.
669 auto name_start = name_position;
670 // While name position is less than tokenizer's input's code point length:
671 while (name_position < tokenizer.input.size()) {
672 // Run seek and get the next code point given tokenizer and name
673 // position.
674 tokenizer.seek_and_get_next_code_point(name_position);
675 // Let first code point be true if name position equals name start and
676 // false otherwise.
677 bool first_code_point = name_position == name_start;
678 // Let valid code point be the result of running is a valid name code
679 // point given tokenizer's code point and first code point.
680 auto valid_code_point =
681 idna::valid_name_code_point(tokenizer.code_point, first_code_point);
682 ada_log("tokenizer.code_point=", uint32_t(tokenizer.code_point),
683 " first_code_point=", first_code_point,
684 " valid_code_point=", valid_code_point);
685 // If valid code point is false break.
686 if (!valid_code_point) break;
687 // Set name position to tokenizer's next index.
688 name_position = tokenizer.next_index;
689 }
690
691 // If name position is less than or equal to name start:
692 if (name_position <= name_start) {
693 // Run process a tokenizing error given tokenizer, name start, and
694 // tokenizer's index.
695 if (auto error = tokenizer.process_tokenizing_error(name_start,
696 tokenizer.index)) {
697 ada_log("process_tokenizing_error failed");
698 return tl::unexpected(*error);
699 }
700 // Continue
701 continue;
702 }
703
704 // Run add a token with default length given tokenizer, "name", name
705 // position, and name start.
706 tokenizer.add_token_with_default_length(token_type::NAME, name_position,
707 name_start);
708 continue;
709 }
710
711 // If tokenizer's code point is U+0028 (():
712 if (tokenizer.code_point == '(') {
713 // Let depth be 1.
714 size_t depth = 1;
715 // Let regexp position be tokenizer's next index.
716 auto regexp_position = tokenizer.next_index;
717 // Let regexp start be regexp position.
718 auto regexp_start = regexp_position;
719 // Let error be false.
720 bool error = false;
721
722 // While regexp position is less than tokenizer's input's code point
723 // length:
724 while (regexp_position < tokenizer.input.size()) {
725 // Run seek and get the next code point given tokenizer and regexp
726 // position.
727 tokenizer.seek_and_get_next_code_point(regexp_position);
728
729 // TODO: Optimization opportunity: The next 2 if statements can be
730 // merged. If the result of running is ASCII given tokenizer's code
731 // point is false:
732 if (!unicode::is_ascii(tokenizer.code_point)) {
733 // Run process a tokenizing error given tokenizer, regexp start, and
734 // tokenizer's index.
735 if (auto process_error = tokenizer.process_tokenizing_error(
736 regexp_start, tokenizer.index)) {
737 return tl::unexpected(*process_error);
738 }
739 // Set error to true.
740 error = true;
741 break;
742 }
743
744 // If regexp position equals regexp start and tokenizer's code point is
745 // U+003F (?):
746 if (regexp_position == regexp_start && tokenizer.code_point == '?') {
747 // Run process a tokenizing error given tokenizer, regexp start, and
748 // tokenizer's index.
749 if (auto process_error = tokenizer.process_tokenizing_error(
750 regexp_start, tokenizer.index)) {
751 return tl::unexpected(*process_error);
752 }
753 // Set error to true;
754 error = true;
755 break;
756 }
757
758 // If tokenizer's code point is U+005C (\‍):
759 if (tokenizer.code_point == '\\') {
760 // If regexp position equals tokenizer's input's code point length - 1
761 if (regexp_position == tokenizer.input.size() - 1) {
762 // Run process a tokenizing error given tokenizer, regexp start, and
763 // tokenizer's index.
764 if (auto process_error = tokenizer.process_tokenizing_error(
765 regexp_start, tokenizer.index)) {
766 return tl::unexpected(*process_error);
767 }
768 // Set error to true.
769 error = true;
770 break;
771 }
772 // Run get the next code point given tokenizer.
773 tokenizer.get_next_code_point();
774 // If the result of running is ASCII given tokenizer's code point is
775 // false:
776 if (!unicode::is_ascii(tokenizer.code_point)) {
777 // Run process a tokenizing error given tokenizer, regexp start, and
778 // tokenizer's index.
779 if (auto process_error = tokenizer.process_tokenizing_error(
780 regexp_start, tokenizer.index);
781 process_error.has_value()) {
782 return tl::unexpected(*process_error);
783 }
784 // Set error to true.
785 error = true;
786 break;
787 }
788 // Set regexp position to tokenizer's next index.
789 regexp_position = tokenizer.next_index;
790 continue;
791 }
792
793 // If tokenizer's code point is U+0029 ()):
794 if (tokenizer.code_point == ')') {
795 // Decrement depth by 1.
796 depth--;
797 // If depth is 0:
798 if (depth == 0) {
799 // Set regexp position to tokenizer's next index.
800 regexp_position = tokenizer.next_index;
801 // Break.
802 break;
803 }
804 } else if (tokenizer.code_point == '(') {
805 // Otherwise if tokenizer's code point is U+0028 (():
806 // Increment depth by 1.
807 depth++;
808 // If regexp position equals tokenizer's input's code point length -
809 // 1:
810 if (regexp_position == tokenizer.input.size() - 1) {
811 // Run process a tokenizing error given tokenizer, regexp start, and
812 // tokenizer's index.
813 if (auto process_error = tokenizer.process_tokenizing_error(
814 regexp_start, tokenizer.index)) {
815 return tl::unexpected(*process_error);
816 }
817 // Set error to true.
818 error = true;
819 break;
820 }
821 // Let temporary position be tokenizer's next index.
822 auto temporary_position = tokenizer.next_index;
823 // Run get the next code point given tokenizer.
824 tokenizer.get_next_code_point();
825 // If tokenizer's code point is not U+003F (?):
826 if (tokenizer.code_point != '?') {
827 // Run process a tokenizing error given tokenizer, regexp start, and
828 // tokenizer's index.
829 if (auto process_error = tokenizer.process_tokenizing_error(
830 regexp_start, tokenizer.index)) {
831 return tl::unexpected(*process_error);
832 }
833 // Set error to true.
834 error = true;
835 break;
836 }
837 // Set tokenizer's next index to temporary position.
838 tokenizer.next_index = temporary_position;
839 }
840 // Set regexp position to tokenizer's next index.
841 regexp_position = tokenizer.next_index;
842 }
843
844 // If error is true continue.
845 if (error) continue;
846 // If depth is not zero:
847 if (depth != 0) {
848 // Run process a tokenizing error given tokenizer, regexp start, and
849 // tokenizer's index.
850 if (auto process_error = tokenizer.process_tokenizing_error(
851 regexp_start, tokenizer.index)) {
852 return tl::unexpected(*process_error);
853 }
854 continue;
855 }
856 // Let regexp length be regexp position - regexp start - 1.
857 auto regexp_length = regexp_position - regexp_start - 1;
858 // If regexp length is zero:
859 if (regexp_length == 0) {
860 // Run process a tokenizing error given tokenizer, regexp start, and
861 // tokenizer's index.
862 if (auto process_error = tokenizer.process_tokenizing_error(
863 regexp_start, tokenizer.index)) {
864 ada_log("process_tokenizing_error failed");
865 return tl::unexpected(*process_error);
866 }
867 continue;
868 }
869 // Run add a token given tokenizer, "regexp", regexp position, regexp
870 // start, and regexp length.
871 tokenizer.add_token(token_type::REGEXP, regexp_position, regexp_start,
872 regexp_length);
873 continue;
874 }
875 // Run add a token with default position and length given tokenizer and
876 // "char".
877 tokenizer.add_token_with_defaults(token_type::CHAR);
878 }
879 // Run add a token with default length given tokenizer, "end", tokenizer's
880 // index, and tokenizer's index.
881 tokenizer.add_token_with_default_length(token_type::END, tokenizer.index,
882 tokenizer.index);
883
884 ada_log("tokenizer.token_list size is: ", tokenizer.token_list.size());
885 // Return tokenizer's token list.
886 return tokenizer.token_list;
887}
888
namespace {
// Computes a 256-entry table whose entry is 1 for each character that has to
// be backslash-escaped inside a pattern string, and 0 for all others.
constexpr std::array<uint8_t, 256> make_escape_pattern_table() {
  constexpr char special[] = {'+', '*', '?', ':', '{', '}', '(', ')', '\\'};
  std::array<uint8_t, 256> flags{};
  for (const char c : special) {
    flags[static_cast<uint8_t>(c)] = 1;
  }
  return flags;
}

// Lookup table indexed by the unsigned value of a character.
constexpr std::array<uint8_t, 256> escape_pattern_table =
    make_escape_pattern_table();

// Returns true when `c` must be preceded by a backslash in a pattern string.
constexpr bool should_escape_pattern_char(char c) {
  return escape_pattern_table[static_cast<uint8_t>(c)] != 0;
}
}  // namespace
902
903std::string escape_pattern_string(std::string_view input) {
904 ada_log("escape_pattern_string called with input=", input);
905 if (input.empty()) [[unlikely]] {
906 return "";
907 }
908 // Assert: input is an ASCII string.
910 // Let result be the empty string.
911 std::string result{};
912 // Reserve extra space for potential escapes
913 result.reserve(input.size() * 2);
914
915 // While index is less than input's length:
916 for (const char c : input) {
917 if (should_escape_pattern_char(c)) {
918 // Append U+005C (\‍) to the end of result.
919 result.push_back('\\');
920 }
921 // Append c to the end of result.
922 result.push_back(c);
923 }
924 // Return result.
925 return result;
926}
927
namespace {
// 256-entry lookup table indexed by byte value: an entry is 1 when that byte
// has to be preceded by a backslash inside a regular expression, 0 otherwise.
// The table initializes a constexpr variable, so it is built at compile time.
constexpr std::array<uint8_t, 256> escape_regexp_table = []() constexpr {
  // Characters that carry special meaning in regular expression syntax.
  constexpr std::string_view special_chars = ".+*?^${}()[]|/\\";
  std::array<uint8_t, 256> table{};
  for (const char ch : special_chars) {
    // Index through uint8_t rather than a possibly signed char.
    table[static_cast<uint8_t>(ch)] = 1;
  }
  return table;
}();

// Returns true when `c` is one of the regexp-syntax characters flagged in
// escape_regexp_table.
constexpr bool should_escape_regexp_char(char c) {
  // Named cast (matching should_escape_pattern_char) so bytes >= 0x80 never
  // yield a negative index.
  return escape_regexp_table[static_cast<uint8_t>(c)] != 0;
}
}  // namespace
942
943std::string escape_regexp_string(std::string_view input) {
944 // Assert: input is an ASCII string.
945 ADA_ASSERT_TRUE(idna::is_ascii(input));
946 // Let result be the empty string.
947 std::string result{};
948 // Reserve extra space for potential escapes (worst case: all chars escaped)
949 result.reserve(input.size() * 2);
950 for (const char c : input) {
951 if (should_escape_regexp_char(c)) {
952 // Avoid temporary string allocation - directly append characters
953 result.push_back('\\');
954 result.push_back(c);
955 } else {
956 result.push_back(c);
957 }
958 }
959 return result;
960}
961
962std::string process_base_url_string(std::string_view input,
963 url_pattern_init::process_type type) {
964 // If type is not "pattern" return input.
965 if (type != url_pattern_init::process_type::pattern) {
966 return std::string(input);
967 }
968 // Return the result of escaping a pattern string given input.
969 return escape_pattern_string(input);
970}
971
972constexpr bool is_absolute_pathname(
973 std::string_view input, url_pattern_init::process_type type) noexcept {
974 // If input is the empty string, then return false.
975 if (input.empty()) [[unlikely]] {
976 return false;
977 }
978 // If input[0] is U+002F (/), then return true.
979 if (input.starts_with("/")) return true;
980 // If type is "url", then return false.
981 if (type == url_pattern_init::process_type::url) return false;
982 // If input's code point length is less than 2, then return false.
983 if (input.size() < 2) return false;
984 // If input[0] is U+005C (\‍) and input[1] is U+002F (/), then return true.
985 // If input[0] is U+007B ({) and input[1] is U+002F (/), then return true.
986 // Return false.
987 return input[1] == '/' && (input[0] == '\\' || input[0] == '{');
988}
989
990std::string generate_pattern_string(
991 std::vector<url_pattern_part>& part_list,
992 url_pattern_compile_component_options& options) {
993 // Let result be the empty string.
994 std::string result{};
995 // Let index list be the result of getting the indices for part list.
996 // For each index of index list:
997 for (size_t index = 0; index < part_list.size(); index++) {
998 // Let part be part list[index].
999 // Use reference to avoid copy
1000 const auto& part = part_list[index];
1001 // Let previous part be part list[index - 1] if index is greater than 0,
1002 // otherwise let it be null.
1003 // Use pointer to avoid copy
1004 const url_pattern_part* previous_part =
1005 index == 0 ? nullptr : &part_list[index - 1];
1006 // Let next part be part list[index + 1] if index is less than index list's
1007 // size - 1, otherwise let it be null.
1008 const url_pattern_part* next_part =
1009 index < part_list.size() - 1 ? &part_list[index + 1] : nullptr;
1010 // If part's type is "fixed-text" then:
1011 if (part.type == url_pattern_part_type::FIXED_TEXT) {
1012 // If part's modifier is "none" then:
1013 if (part.modifier == url_pattern_part_modifier::none) {
1014 // Append the result of running escape a pattern string given part's
1015 // value to the end of result.
1016 result.append(escape_pattern_string(part.value));
1017 continue;
1018 }
1019 // Append "{" to the end of result.
1020 result += "{";
1021 // Append the result of running escape a pattern string given part's value
1022 // to the end of result.
1023 result.append(escape_pattern_string(part.value));
1024 // Append "}" to the end of result.
1025 result += "}";
1026 // Append the result of running convert a modifier to a string given
1027 // part's modifier to the end of result.
1028 result.append(convert_modifier_to_string(part.modifier));
1029 continue;
1030 }
1031 // Let custom name be true if part's name[0] is not an ASCII digit;
1032 // otherwise false.
1033 bool custom_name = !unicode::is_ascii_digit(part.name[0]);
1034 // Let needs grouping be true if at least one of the following are true,
1035 // otherwise let it be false:
1036 // - part's suffix is not the empty string.
1037 // - part's prefix is not the empty string and is not options's prefix code
1038 // point.
1039 bool needs_grouping =
1040 !part.suffix.empty() ||
1041 (!part.prefix.empty() && !options.get_prefix().empty() &&
1042 part.prefix[0] != options.get_prefix()[0]);
1043
1044 // If all of the following are true:
1045 // - needs grouping is false; and
1046 // - custom name is true; and
1047 // - part's type is "segment-wildcard"; and
1048 // - part's modifier is "none"; and
1049 // - next part is not null; and
1050 // - next part's prefix is the empty string; and
1051 // - next part's suffix is the empty string
1052 if (!needs_grouping && custom_name &&
1053 part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
1054 part.modifier == url_pattern_part_modifier::none && next_part &&
1055 next_part->prefix.empty() && next_part->suffix.empty()) {
1056 // If next part's type is "fixed-text":
1057 if (next_part->type == url_pattern_part_type::FIXED_TEXT) {
1058 // Set needs grouping to true if the result of running is a valid name
1059 // code point given next part's value's first code point and the boolean
1060 // false is true.
1061 if (idna::valid_name_code_point(next_part->value[0], false)) {
1062 needs_grouping = true;
1063 }
1064 } else {
1065 // Set needs grouping to true if next part's name[0] is an ASCII digit.
1066 needs_grouping = !next_part->name.empty() &&
1067 unicode::is_ascii_digit(next_part->name[0]);
1068 }
1069 }
1070
1071 // If all of the following are true:
1072 // - needs grouping is false; and
1073 // - part's prefix is the empty string; and
1074 // - previous part is not null; and
1075 // - previous part's type is "fixed-text"; and
1076 // - previous part's value's last code point is options's prefix code point.
1077 // then set needs grouping to true.
1078 if (!needs_grouping && part.prefix.empty() && previous_part &&
1079 previous_part->type == url_pattern_part_type::FIXED_TEXT &&
1080 !previous_part->value.empty() && !options.get_prefix().empty() &&
1081 previous_part->value.back() == options.get_prefix()[0]) {
1082 needs_grouping = true;
1083 }
1084
1085 // Assert: part's name is not the empty string or null.
1086 ADA_ASSERT_TRUE(!part.name.empty());
1087
1088 // If needs grouping is true, then append "{" to the end of result.
1089 if (needs_grouping) {
1090 result.append("{");
1091 }
1092
1093 // Append the result of running escape a pattern string given part's prefix
1094 // to the end of result.
1095 result.append(escape_pattern_string(part.prefix));
1096
1097 // If custom name is true:
1098 if (custom_name) {
1099 // Append ":" to the end of result.
1100 result.append(":");
1101 // Append part's name to the end of result.
1102 result.append(part.name);
1103 }
1104
1105 // If part's type is "regexp" then:
1106 if (part.type == url_pattern_part_type::REGEXP) {
1107 // Append "(" to the end of result.
1108 result.append("(");
1109 // Append part's value to the end of result.
1110 result.append(part.value);
1111 // Append ")" to the end of result.
1112 result.append(")");
1113 } else if (part.type == url_pattern_part_type::SEGMENT_WILDCARD &&
1114 !custom_name) {
1115 // Otherwise if part's type is "segment-wildcard" and custom name is
1116 // false: Append "(" to the end of result.
1117 result.append("(");
1118 // Append the result of running generate a segment wildcard regexp given
1119 // options to the end of result.
1120 result.append(generate_segment_wildcard_regexp(options));
1121 // Append ")" to the end of result.
1122 result.append(")");
1123 } else if (part.type == url_pattern_part_type::FULL_WILDCARD) {
1124 // Otherwise if part's type is "full-wildcard":
1125 // If custom name is false and one of the following is true:
1126 // - previous part is null; or
1127 // - previous part's type is "fixed-text"; or
1128 // - previous part's modifier is not "none"; or
1129 // - needs grouping is true; or
1130 // - part's prefix is not the empty string
1131 // - then append "*" to the end of result.
1132 if (!custom_name &&
1133 (!previous_part ||
1134 previous_part->type == url_pattern_part_type::FIXED_TEXT ||
1135 previous_part->modifier != url_pattern_part_modifier::none ||
1136 needs_grouping || !part.prefix.empty())) {
1137 result.append("*");
1138 } else {
1139 // Append "(" to the end of result.
1140 // Append full wildcard regexp value to the end of result.
1141 // Append ")" to the end of result.
1142 result.append("(.*)");
1143 }
1144 }
1145
1146 // If all of the following are true:
1147 // - part's type is "segment-wildcard"; and
1148 // - custom name is true; and
1149 // - part's suffix is not the empty string; and
1150 // - The result of running is a valid name code point given part's suffix's
1151 // first code point and the boolean false is true then append U+005C (\‍) to
1152 // the end of result.
1153 if (part.type == url_pattern_part_type::SEGMENT_WILDCARD && custom_name &&
1154 !part.suffix.empty() &&
1155 idna::valid_name_code_point(part.suffix[0], false)) {
1156 result.append("\\");
1157 }
1158
1159 // Append the result of running escape a pattern string given part's suffix
1160 // to the end of result.
1161 result.append(escape_pattern_string(part.suffix));
1162 // If needs grouping is true, then append "}" to the end of result.
1163 if (needs_grouping) result.append("}");
1164 // Append the result of running convert a modifier to a string given part's
1165 // modifier to the end of result.
1166 result.append(convert_modifier_to_string(part.modifier));
1167 }
1168 // Return result.
1169 return result;
1170}
1171} // namespace ada::url_pattern_helpers
1172
1173#endif // ADA_INCLUDE_URL_PATTERN
Declaration of the character sets used by unicode functions.
#define ADA_ASSERT_TRUE(COND)
Definitions for helper functions used within Ada.
bool constexpr is_ascii(std::u32string_view view)
const uint32_t table[8198][2]
Definition ada_idna.cpp:593
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
errors
Error codes for URL parsing operations.
Definition errors.h:17
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
URL scheme type definitions and utilities.
Definitions for all unicode specific functions.
Declaration for the URLPattern helpers.