001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.openstreetmap.josm.data.validation.routines; 018 019import static org.openstreetmap.josm.tools.I18n.tr; 020 021import java.net.URI; 022import java.net.URISyntaxException; 023import java.util.Collections; 024import java.util.HashSet; 025import java.util.Locale; 026import java.util.Optional; 027import java.util.Set; 028import java.util.regex.Matcher; 029import java.util.regex.Pattern; 030 031import org.openstreetmap.josm.tools.Logging; 032 033/** 034 * <p><b>URL Validation</b> routines.</p> 035 * Behavior of validation is modified by passing in options: 036 * <ul> 037 * <li>ALLOW_2_SLASHES - [FALSE] Allows double '/' characters in the path 038 * component.</li> 039 * <li>NO_FRAGMENT- [FALSE] By default fragments are allowed, if this option is 040 * included then fragments are flagged as illegal.</li> 041 * <li>ALLOW_ALL_SCHEMES - [FALSE] By default only http, https, and ftp are 042 * considered valid schemes. Enabling this option will let any scheme pass validation.</li> 043 * </ul> 044 * 045 * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02, 046 * http://javascript.internet.com. However, this validation now bears little resemblance 047 * to the php original.</p> 048 * <pre> 049 * Example of usage: 050 * Construct a UrlValidator with valid schemes of "http", and "https". 051 * 052 * String[] schemes = {"http","https"}. 053 * UrlValidator urlValidator = new UrlValidator(schemes); 054 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 055 * System.out.println("url is valid"); 056 * } else { 057 * System.out.println("url is invalid"); 058 * } 059 * 060 * prints "url is invalid" 061 * If instead the default constructor is used. 062 * 063 * UrlValidator urlValidator = new UrlValidator(); 064 * if (urlValidator.isValid("ftp://foo.bar.com/")) { 065 * System.out.println("url is valid"); 066 * } else { 067 * System.out.println("url is invalid"); 068 * } 069 * 070 * prints out "url is valid" 071 * </pre> 072 * 073 * @version $Revision: 1741724 $ 074 * @see 075 * <a href="http://www.ietf.org/rfc/rfc2396.txt"> 076 * Uniform Resource Identifiers (URI): Generic Syntax 077 * </a> 078 * 079 * @since Validator 1.4 080 */ 081public class UrlValidator extends AbstractValidator { 082 083 /** 084 * Allows all validly formatted schemes to pass validation instead of 085 * supplying a set of valid schemes. 086 */ 087 public static final long ALLOW_ALL_SCHEMES = 1 << 0; 088 089 /** 090 * Allow two slashes in the path component of the URL. 091 */ 092 public static final long ALLOW_2_SLASHES = 1 << 1; 093 094 /** 095 * Enabling this options disallows any URL fragments. 096 */ 097 public static final long NO_FRAGMENTS = 1 << 2; 098 099 /** 100 * Allow local URLs, such as http://localhost/ or http://machine/ . 101 * This enables a broad-brush check, for complex local machine name 102 * validation requirements you should create your validator with 103 * a {@link RegexValidator} instead ({@link #UrlValidator(RegexValidator, long)}) 104 */ 105 public static final long ALLOW_LOCAL_URLS = 1 << 3; // CHECKSTYLE IGNORE MagicNumber 106 107 /** 108 * This expression derived/taken from the BNF for URI (RFC2396). 109 */ 110 private static final String URL_REGEX = 111 "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"; 112 // 12 3 4 5 6 7 8 9 113 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEX); 114 115 /** 116 * Schema/Protocol (ie. http:, ftp:, file:, etc). 117 */ 118 private static final int PARSE_URL_SCHEME = 2; 119 120 /** 121 * Includes hostname/ip and port number. 122 */ 123 private static final int PARSE_URL_AUTHORITY = 4; 124 125 private static final int PARSE_URL_PATH = 5; 126 127 private static final int PARSE_URL_QUERY = 7; 128 129 private static final int PARSE_URL_FRAGMENT = 9; 130 131 /** 132 * Protocol scheme (e.g. http, ftp, https). 133 */ 134 private static final String SCHEME_REGEX = "^\\p{Alpha}[\\p{Alnum}\\+\\-\\.]*"; 135 private static final Pattern SCHEME_PATTERN = Pattern.compile(SCHEME_REGEX); 136 137 // Drop numeric, and "+-." for now 138 // TODO does not allow for optional userinfo. 139 // Validation of character set is done by isValidAuthority 140 private static final String AUTHORITY_CHARS_REGEX = "\\p{Alnum}\\-\\."; // allows for IPV4 but not IPV6 141 private static final String IPV6_REGEX = "[0-9a-fA-F:]+"; // do this as separate match because : could cause ambiguity with port prefix 142 143 // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) 144 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" 145 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 146 // We assume that password has the same valid chars as user info 147 private static final String USERINFO_CHARS_REGEX = "[a-zA-Z0-9%-._~!$&'()*+,;=]"; 148 // since neither ':' nor '@' are allowed chars, we don't need to use non-greedy matching 149 private static final String USERINFO_FIELD_REGEX = 150 USERINFO_CHARS_REGEX + "+:" + // At least one character for the name 151 USERINFO_CHARS_REGEX + "*@"; // password may be absent 152 private static final String AUTHORITY_REGEX = 153 "(?:\\[("+IPV6_REGEX+")\\]|(?:(?:"+USERINFO_FIELD_REGEX+")?([" + AUTHORITY_CHARS_REGEX + "]*)))(:\\d*)?(.*)?"; 154 // 1 e.g. user:pass@ 2 3 4 155 private static final Pattern AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEX); 156 157 private static final int PARSE_AUTHORITY_IPV6 = 1; 158 159 private static final int PARSE_AUTHORITY_HOST_IP = 2; // excludes userinfo, if present 160 161 /** 162 * Should always be empty. The code currently allows spaces. 163 */ 164 private static final int PARSE_AUTHORITY_EXTRA = 4; 165 166 private static final String PATH_REGEX = "^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"; 167 private static final Pattern PATH_PATTERN = Pattern.compile(PATH_REGEX); 168 169 private static final String QUERY_REGEX = "^(.*)$"; 170 private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX); 171 172 /** 173 * Holds the set of current validation options. 174 */ 175 private final long options; 176 177 /** 178 * The set of schemes that are allowed to be in a URL. 179 */ 180 private final Set<String> allowedSchemes; // Must be lower-case 181 182 /** 183 * Regular expressions used to manually validate authorities if IANA 184 * domain name validation isn't desired. 185 */ 186 private final RegexValidator authorityValidator; 187 188 /** 189 * If no schemes are provided, default to this set. 190 */ 191 private static final String[] DEFAULT_SCHEMES = {"http", "https", "ftp"}; // Must be lower-case 192 193 /** 194 * Singleton instance of this class with default schemes and options. 195 */ 196 private static final UrlValidator DEFAULT_URL_VALIDATOR = new UrlValidator(UrlValidator.ALLOW_2_SLASHES); 197 198 /** 199 * Returns the singleton instance of this class with default schemes and options. 200 * @return singleton instance with default schemes and options 201 */ 202 public static UrlValidator getInstance() { 203 return DEFAULT_URL_VALIDATOR; 204 } 205 206 /** 207 * Create a UrlValidator with default properties. 208 */ 209 public UrlValidator() { 210 this((String[]) null); 211 } 212 213 /** 214 * Behavior of validation is modified by passing in several strings options: 215 * @param schemes Pass in one or more url schemes to consider valid, passing in 216 * a null will default to "http,https,ftp" being valid. 217 * If a non-null schemes is specified then all valid schemes must 218 * be specified. Setting the ALLOW_ALL_SCHEMES option will 219 * ignore the contents of schemes. 220 */ 221 public UrlValidator(String... schemes) { 222 this(schemes, 0L); 223 } 224 225 /** 226 * Initialize a UrlValidator with the given validation options. 227 * @param options The options should be set using the public constants declared in 228 * this class. To set multiple options you simply add them together. For example, 229 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 230 */ 231 public UrlValidator(long options) { 232 this(null, null, options); 233 } 234 235 /** 236 * Behavior of validation is modified by passing in options: 237 * @param schemes The set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 238 * @param options The options should be set using the public constants declared in 239 * this class. To set multiple options you simply add them together. For example, 240 * ALLOW_2_SLASHES + NO_FRAGMENTS enables both of those options. 241 */ 242 public UrlValidator(String[] schemes, long options) { 243 this(schemes, null, options); 244 } 245 246 /** 247 * Initialize a UrlValidator with the given validation options. 248 * @param authorityValidator Regular expression validator used to validate the authority part 249 * This allows the user to override the standard set of domains. 250 * @param options Validation options. Set using the public constants of this class. 251 * To set multiple options, simply add them together: 252 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 253 * enables both of those options. 254 */ 255 public UrlValidator(RegexValidator authorityValidator, long options) { 256 this(null, authorityValidator, options); 257 } 258 259 /** 260 * Customizable constructor. Validation behavior is modifed by passing in options. 261 * @param schemes the set of valid schemes. Ignored if the ALLOW_ALL_SCHEMES option is set. 262 * @param authorityValidator Regular expression validator used to validate the authority part 263 * @param options Validation options. Set using the public constants of this class. 264 * To set multiple options, simply add them together: 265 * <p><code>ALLOW_2_SLASHES + NO_FRAGMENTS</code></p> 266 * enables both of those options. 267 */ 268 public UrlValidator(String[] schemes, RegexValidator authorityValidator, long options) { 269 this.options = options; 270 271 if (isOn(ALLOW_ALL_SCHEMES)) { 272 allowedSchemes = Collections.emptySet(); 273 } else { 274 if (schemes == null) { 275 schemes = DEFAULT_SCHEMES; 276 } 277 allowedSchemes = new HashSet<>(schemes.length); 278 for (String scheme : schemes) { 279 allowedSchemes.add(scheme.toLowerCase(Locale.ENGLISH)); 280 } 281 } 282 283 this.authorityValidator = authorityValidator; 284 } 285 286 /** 287 * <p>Checks if a field has a valid url address.</p> 288 * 289 * Note that the method calls #isValidAuthority() 290 * which checks that the domain is valid. 291 * 292 * @param value The value validation is being performed on. A <code>null</code> 293 * value is considered invalid. 294 * @return true if the url is valid. 295 */ 296 @Override 297 public boolean isValid(String value) { 298 if (value == null) { 299 return false; 300 } 301 302 // Check the whole url address structure 303 Matcher urlMatcher = URL_PATTERN.matcher(value); 304 if (!urlMatcher.matches()) { 305 setErrorMessage(tr("URL is invalid")); 306 return false; 307 } 308 309 String scheme = urlMatcher.group(PARSE_URL_SCHEME); 310 if (!isValidScheme(scheme)) { 311 setErrorMessage(tr("URL contains an invalid protocol: {0}", scheme)); 312 return false; 313 } 314 315 String authority = urlMatcher.group(PARSE_URL_AUTHORITY); 316 if ("file".equals(scheme)) { // Special case - file: allows an empty authority 317 if (!"".equals(authority) && authority.contains(":")) { // but cannot allow trailing : 318 setErrorMessage(tr("URL contains an invalid authority: {0}", authority)); 319 return false; 320 } 321 // drop through to continue validation 322 } else { // not file: 323 // Validate the authority 324 if (!isValidAuthority(authority)) { 325 setErrorMessage(tr("URL contains an invalid authority: {0}", authority)); 326 return false; 327 } 328 } 329 330 String path = urlMatcher.group(PARSE_URL_PATH); 331 if (!isValidPath(path)) { 332 setErrorMessage(tr("URL contains an invalid path: {0}", path)); 333 return false; 334 } 335 336 String query = urlMatcher.group(PARSE_URL_QUERY); 337 if (!isValidQuery(query)) { 338 setErrorMessage(tr("URL contains an invalid query: {0}", query)); 339 return false; 340 } 341 342 String fragment = urlMatcher.group(PARSE_URL_FRAGMENT); 343 if (!isValidFragment(fragment)) { 344 setErrorMessage(tr("URL contains an invalid fragment: {0}", fragment)); 345 return false; 346 } 347 348 return true; 349 } 350 351 @Override 352 public String getValidatorName() { 353 return tr("URL validator"); 354 } 355 356 /** 357 * Validate scheme. If schemes[] was initialized to a non null, 358 * then only those schemes are allowed. 359 * Otherwise the default schemes are "http", "https", "ftp". 360 * Matching is case-blind. 361 * @param scheme The scheme to validate. A <code>null</code> value is considered 362 * invalid. 363 * @return true if valid. 364 */ 365 protected boolean isValidScheme(String scheme) { 366 if (scheme == null) { 367 return false; 368 } 369 370 // TODO could be removed if external schemes were checked in the ctor before being stored 371 if (!SCHEME_PATTERN.matcher(scheme).matches()) { 372 return false; 373 } 374 375 return isOn(ALLOW_ALL_SCHEMES) || allowedSchemes.contains(scheme.toLowerCase(Locale.ENGLISH)); 376 } 377 378 /** 379 * Returns true if the authority is properly formatted. An authority is the combination 380 * of hostname and port. A <code>null</code> authority value is considered invalid. 381 * Note: this implementation validates the domain unless a RegexValidator was provided. 382 * If a RegexValidator was supplied and it matches, then the authority is regarded 383 * as valid with no further checks, otherwise the method checks against the 384 * AUTHORITY_PATTERN and the DomainValidator (ALLOW_LOCAL_URLS) 385 * @param authority Authority value to validate, alllows IDN 386 * @return true if authority (hostname and port) is valid. 387 */ 388 protected boolean isValidAuthority(String authority) { 389 if (authority == null) { 390 return false; 391 } 392 393 // check manual authority validation if specified 394 if (authorityValidator != null && authorityValidator.isValid(authority)) { 395 return true; 396 } 397 // convert to ASCII if possible 398 final String authorityASCII = DomainValidator.unicodeToASCII(authority); 399 400 Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authorityASCII); 401 if (!authorityMatcher.matches()) { 402 return false; 403 } 404 405 // We have to process IPV6 separately because that is parsed in a different group 406 String ipv6 = authorityMatcher.group(PARSE_AUTHORITY_IPV6); 407 if (ipv6 != null) { 408 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 409 if (!inetAddressValidator.isValidInet6Address(ipv6)) { 410 return false; 411 } 412 } else { 413 String hostLocation = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); 414 // check if authority is hostname or IP address: 415 // try a hostname first since that's much more likely 416 DomainValidator domainValidator = DomainValidator.getInstance(isOn(ALLOW_LOCAL_URLS)); 417 if (!domainValidator.isValid(hostLocation)) { 418 // try an IPv4 address 419 InetAddressValidator inetAddressValidator = InetAddressValidator.getInstance(); 420 if (!inetAddressValidator.isValidInet4Address(hostLocation)) { 421 // isn't IPv4, so the URL is invalid 422 return false; 423 } 424 } 425 } 426 427 return Optional.ofNullable(authorityMatcher.group(PARSE_AUTHORITY_EXTRA)).orElse("").trim().isEmpty(); 428 } 429 430 /** 431 * Returns true if the path is valid. A <code>null</code> value is considered invalid. 432 * @param path Path value to validate. 433 * @return true if path is valid. 434 */ 435 protected boolean isValidPath(String path) { 436 if (path == null) { 437 return false; 438 } 439 440 if (!PATH_PATTERN.matcher(path).matches()) { 441 return false; 442 } 443 444 try { 445 URI uri = new URI(null, null, path, null); 446 String norm = uri.normalize().getPath(); 447 if (norm.startsWith("/../") // Trying to go via the parent dir 448 || "/..".equals(norm)) { // Trying to go to the parent dir 449 return false; 450 } 451 } catch (URISyntaxException e) { 452 Logging.trace(e); 453 return false; 454 } 455 456 return isOn(ALLOW_2_SLASHES) || countToken("//", path) <= 0; 457 } 458 459 /** 460 * Returns true if the query is null or it's a properly formatted query string. 461 * @param query Query value to validate. 462 * @return true if query is valid. 463 */ 464 protected boolean isValidQuery(String query) { 465 if (query == null) { 466 return true; 467 } 468 469 return QUERY_PATTERN.matcher(query).matches(); 470 } 471 472 /** 473 * Returns true if the given fragment is null or fragments are allowed. 474 * @param fragment Fragment value to validate. 475 * @return true if fragment is valid. 476 */ 477 protected boolean isValidFragment(String fragment) { 478 if (fragment == null) { 479 return true; 480 } 481 482 return isOff(NO_FRAGMENTS); 483 } 484 485 /** 486 * Returns the number of times the token appears in the target. 487 * @param token Token value to be counted. 488 * @param target Target value to count tokens in. 489 * @return the number of tokens. 490 */ 491 protected int countToken(String token, String target) { 492 int tokenIndex = 0; 493 int count = 0; 494 while (tokenIndex != -1) { 495 tokenIndex = target.indexOf(token, tokenIndex); 496 if (tokenIndex > -1) { 497 tokenIndex++; 498 count++; 499 } 500 } 501 return count; 502 } 503 504 /** 505 * Tests whether the given flag is on. If the flag is not a power of 2 506 * (ie. 3) this tests whether the combination of flags is on. 507 * 508 * @param flag Flag value to check. 509 * 510 * @return whether the specified flag value is on. 511 */ 512 private boolean isOn(long flag) { 513 return (options & flag) > 0; 514 } 515 516 /** 517 * Tests whether the given flag is off. If the flag is not a power of 2 518 * (ie. 3) this tests whether the combination of flags is off. 519 * 520 * @param flag Flag value to check. 521 * 522 * @return whether the specified flag value is off. 523 */ 524 private boolean isOff(long flag) { 525 return (options & flag) == 0; 526 } 527}