ferron/util/
url_sanitizer.rs

1// Copyright (c) 2018-2025 SVR.JS
2// Portions of this file are derived from SVR.JS (https://git.svrjs.org/svrjs/svrjs).
3//
4// Permission is hereby granted, free of charge, to any person obtaining a copy
5// of this software and associated documentation files (the "Software"), to deal
6// in the Software without restriction, including without limitation the rights
7// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8// copies of the Software, and to permit persons to whom the Software is
9// furnished to do so, subject to the following conditions:
10//
11// The above copyright notice and this permission notice shall be included in all
12// copies or substantial portions of the Software.
13//
14use anyhow::{anyhow, Result};
15use std::str;
16
17pub fn sanitize_url(resource: &str, allow_double_slashes: bool) -> Result<String> {
18  if resource == "*" || resource.is_empty() {
19    return Ok(resource.to_string());
20  }
21
22  let mut sanitized = String::with_capacity(resource.len());
23
24  // Remove null bytes and handle initial sanitization
25  for &ch in resource.as_bytes() {
26    if ch != b'\0' {
27      sanitized.push(ch as char);
28    }
29  }
30
31  // Check for malformed URL encoding (invalid percent encoding)
32  let bytes = sanitized.as_bytes();
33  let mut i = 0;
34  while i < bytes.len() {
35    if bytes[i] == b'%' {
36      if i + 2 >= bytes.len() {
37        return Err(anyhow!("URI malformed"));
38      }
39      let hex = &bytes[i + 1..i + 3];
40      if !hex[0].is_ascii_hexdigit() || !hex[1].is_ascii_hexdigit() {
41        return Err(anyhow!("URI malformed"));
42      }
43      let value = u8::from_str_radix(str::from_utf8(hex)?, 16)?;
44      if value == 0xc0 || value == 0xc1 || value >= 0xfe {
45        return Err(anyhow!("URI malformed"));
46      }
47    }
48    i += 1;
49  }
50
51  // Decode percent-encoded characters while preserving safe ones
52  let mut decoded = String::with_capacity(sanitized.len());
53  let bytes = sanitized.as_bytes();
54  let mut i = 0;
55  while i < bytes.len() {
56    if bytes[i] == b'%' && i + 2 < bytes.len() {
57      let hex = &bytes[i + 1..i + 3];
58      if let Ok(value) = u8::from_str_radix(str::from_utf8(hex)?, 16) {
59        if value != 0 {
60          let decoded_char = value as char;
61          if decoded_char.is_ascii_alphanumeric()
62                        || "!$&'()*+,-./0123456789:;=@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]_abcdefghijklmnopqrstuvwxyz~"
63                            .contains(decoded_char)
64                    {
65                        decoded.push(decoded_char);
66                    } else {
67                        decoded.push('%');
68                        decoded.push(hex[0] as char);
69                        decoded.push(hex[1] as char);
70                    }
71          i += 2;
72        } else {
73          i += 3;
74          continue;
75        }
76      } else {
77        decoded.push('%');
78      }
79    } else {
80      decoded.push(bytes[i] as char);
81    }
82    i += 1;
83  }
84
85  // Encode unsafe characters
86  let mut encoded = String::with_capacity(decoded.len());
87  for ch in decoded.chars() {
88    match ch {
89      '<' | '>' | '^' | '`' | '{' | '|' | '}' => {
90        encoded.push_str(&format!("%{:02X}", ch as u8));
91      }
92      _ => encoded.push(ch),
93    }
94  }
95
96  // Ensure the resource starts with a slash
97  if !encoded.starts_with('/') {
98    encoded.insert(0, '/');
99  }
100
101  // Convert backslashes to slashes and handle duplicate slashes
102  let mut final_resource = String::with_capacity(encoded.len());
103  let mut last_was_slash = false;
104  for ch in encoded.chars() {
105    if ch == '\\' {
106      final_resource.push('/');
107      last_was_slash = true;
108    } else if ch == '/' {
109      if !allow_double_slashes && last_was_slash {
110        continue;
111      }
112      final_resource.push('/');
113      last_was_slash = true;
114    } else {
115      final_resource.push(ch);
116      last_was_slash = false;
117    }
118  }
119
120  // Normalize path segments (remove ".", "..", trailing dots)
121  let mut segments: Vec<&str> = Vec::new();
122  for mut part in final_resource.split('/') {
123    match part {
124      "." => continue,
125      ".." => {
126        segments.pop(); // Go up one directory
127      }
128      "" => {
129        if allow_double_slashes {
130          segments.push("");
131        }
132      }
133      _ => {
134        while part.ends_with('.') {
135          part = &part[..part.len() - 1];
136        }
137        if !part.is_empty() {
138          segments.push(part);
139        }
140      }
141    }
142  }
143
144  final_resource = if allow_double_slashes {
145    segments.join("/")
146  } else if !segments.is_empty() && final_resource.ends_with('/') {
147    format!("/{}/", segments.join("/"))
148  } else {
149    format!("/{}", segments.join("/"))
150  };
151
152  // Remove any remaining "/../" sequences
153  while final_resource.contains("/../") {
154    final_resource = final_resource.replacen("/../", "", 1);
155  }
156
157  // Ensure result is not empty
158  if final_resource.is_empty() {
159    final_resource.push('/');
160  }
161
162  Ok(final_resource)
163}
164
165// Path sanitizer tests taken from SVR.JS web server
#[cfg(test)]
mod tests {
  use super::*;
  use anyhow::Result;

  /// Sanitizes each input without allowing double slashes and compares the
  /// result with the paired expectation.
  fn expect_sanitized(cases: &[(&str, &str)]) -> Result<()> {
    for &(input, expected) in cases {
      let actual = sanitize_url(input, false)?;
      assert_eq!(actual, expected);
    }
    Ok(())
  }

  #[test]
  fn should_return_asterisk_for_asterisk() -> Result<()> {
    expect_sanitized(&[("*", "*")])
  }

  #[test]
  fn should_return_empty_string_for_empty_string() -> Result<()> {
    expect_sanitized(&[("", "")])
  }

  #[test]
  fn should_remove_null_characters() -> Result<()> {
    expect_sanitized(&[("/test%00", "/test"), ("/test\0", "/test")])
  }

  #[test]
  fn should_throw_uri_error_for_malformed_url() {
    for malformed in ["%c0%af", "%u002f", "%as"] {
      assert!(sanitize_url(malformed, false).is_err());
    }
  }

  #[test]
  fn should_ensure_the_resource_starts_with_a_slash() -> Result<()> {
    expect_sanitized(&[("test", "/test")])
  }

  #[test]
  fn should_convert_backslashes_to_slashes() -> Result<()> {
    expect_sanitized(&[("test\\path", "/test/path")])
  }

  #[test]
  fn should_handle_duplicate_slashes() -> Result<()> {
    let collapsed = sanitize_url("test//path", false)?;
    assert_eq!(collapsed, "/test/path");

    let preserved = sanitize_url("test//path", true)?;
    assert_eq!(preserved, "/test//path");

    Ok(())
  }

  #[test]
  fn should_handle_relative_navigation() -> Result<()> {
    expect_sanitized(&[
      ("/./test", "/test"),
      ("/../test", "/test"),
      ("../test", "/test"),
      ("./test", "/test"),
      ("/test/./", "/test/"),
      ("/test/../", "/"),
      ("/test/../path", "/path"),
    ])
  }

  #[test]
  fn should_remove_trailing_dots_in_paths() -> Result<()> {
    expect_sanitized(&[("/test...", "/test"), ("/test.../", "/test/")])
  }

  #[test]
  fn should_return_slash_for_empty_sanitized_resource() -> Result<()> {
    expect_sanitized(&[("/../..", "/")])
  }

  #[test]
  fn should_encode_special_characters() -> Result<()> {
    expect_sanitized(&[
      ("/test<path>", "/test%3Cpath%3E"),
      ("/test^path", "/test%5Epath"),
      ("/test`path", "/test%60path"),
      ("/test{path}", "/test%7Bpath%7D"),
      ("/test|path", "/test%7Cpath"),
    ])
  }

  #[test]
  fn should_preserve_certain_characters() -> Result<()> {
    // Each of these paths must come back exactly as it went in.
    let untouched = [
      "/test!path",
      "/test$path",
      "/test&path",
      "/test-path",
      "/test=path",
      "/test@path",
      "/test_path",
      "/test~path",
    ];
    for path in untouched {
      assert_eq!(sanitize_url(path, false)?, path);
    }
    Ok(())
  }

  #[test]
  fn should_decode_url_encoded_characters_while_preserving_certain_characters() -> Result<()> {
    expect_sanitized(&[
      ("/test%20path", "/test%20path"),
      ("/test%21path", "/test!path"),
      ("/test%22path", "/test%22path"),
      ("/test%24path", "/test$path"),
      ("/test%25path", "/test%25path"),
      ("/test%26path", "/test&path"),
      ("/test%2Dpath", "/test-path"),
      ("/test%3Cpath", "/test%3Cpath"),
      ("/test%3Dpath", "/test=path"),
      ("/test%3Epath", "/test%3Epath"),
      ("/test%40path", "/test@path"),
      ("/test%5Fpath", "/test_path"),
      ("/test%7Dpath", "/test%7Dpath"),
      ("/test%7Epath", "/test~path"),
    ])
  }

  #[test]
  fn should_decode_url_encoded_alphanumeric_characters_while_preserving_certain_characters(
  ) -> Result<()> {
    expect_sanitized(&[
      ("/conf%69g.json", "/config.json"),
      ("/CONF%49G.JSON", "/CONFIG.JSON"),
      ("/svr%32.js", "/svr2.js"),
      ("/%73%76%72%32%2E%6A%73", "/svr2.js"),
    ])
  }

  #[test]
  fn should_decode_url_encoded_characters_regardless_of_the_letter_case_of_the_url_encoding(
  ) -> Result<()> {
    expect_sanitized(&[("/%5f", "/_"), ("/%5F", "/_")])
  }
}