twitter_regex.rb 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # frozen_string_literal: true
  2. module Twitter::TwitterText
  # Configuration as defined within this namespace.
  # NOTE(review): presumably this reopens a Configuration class supplied by an
  # external library so that the method below replaces its version — confirm.
  class Configuration
    # Reports whether emoji parsing is enabled.
    #
    # @return [Boolean] always +false+
    def emoji_parsing_enabled
      false
    end
  end
  # Replaces and adds URL-matching entries in the REGEXEN table.
  #
  # REGEXEN itself, and the :valid_url_preceding_chars, :valid_domain,
  # :valid_port_number, :validate_url_unreserved, :validate_url_pct_encoded and
  # :validate_url_sub_delims entries read below, are defined outside this file.
  #
  # Ordering matters throughout: interpolating a Regexp into another literal
  # copies the pattern as it exists when that literal is evaluated, so every
  # entry must be assigned before the entries that embed it.
  class Regex
    # A single character allowed inside a URL path: anything except whitespace,
    # angle brackets, parentheses and "?".
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>()?]/iou

    # A parenthesised run of path characters, optionally containing one nested
    # parenthesised run.
    #
    # Assigned BEFORE :valid_url_path_ending_chars on purpose: that entry
    # interpolates this one, and would otherwise capture whatever value the
    # table held for this key beforehand instead of the definition below.
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox

    # What a URL path may end with: either one character outside the listed set
    # (whitespace, brackets, quotes and trailing punctuation), or a complete
    # balanced-parentheses group as defined above.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}()?!*"'「」<>;:=,.$%\[\]~&|]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou

    # Code point ranges spliced into the query-string character classes below.
    # NOTE(review): the ranges look like the RFC 3987 `ucschar` and `iprivate`
    # sets — confirm against the RFC before editing.
    # rubocop:disable Layout/LineLength
    UCHARS = '\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}'
    # rubocop:enable Layout/LineLength

    # Characters allowed anywhere in a query string, and the narrower set a
    # query string may end with.
    REGEXEN[:valid_url_query_chars] = %r{[a-z0-9!?*'();:&=+$/%#\[\]\-_.,~|@\^#{UCHARS}]}iou
    REGEXEN[:valid_url_query_ending_chars] = %r{[a-z0-9_&=#/\-#{UCHARS}]}iou

    # One path chunk: either path characters interleaved with balanced
    # parentheses and finished by a valid ending character, or path characters
    # followed by a slash.
    REGEXEN[:valid_url_path] = %r{(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+/)
    )}iox

    # Full URL matcher; the capture groups are described inline.
    REGEXEN[:valid_url] = %r{
      ( # $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
        ( # $3 URL
          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher|gemini)://)? # $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]}) # $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))? # $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)? # $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
        )
      )
    }iox

    # One character of the node part used in xmpp: URIs below: an unreserved
    # character, a percent-encoded sequence, or one of the listed delimiters.
    REGEXEN[:validate_nodeid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      [!$()*+,;=]
    )/iox

    # One character of the resource part used in xmpp: URIs below: an
    # unreserved character, a percent-encoded sequence, or a sub-delimiter.
    REGEXEN[:validate_resid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}
    )/iox

    # Matcher for the two non-"//" schemes handled here: `xmpp:` (optional
    # authority, optional user, domain, optional resource and query) and
    # `magnet:` (query string only).
    REGEXEN[:valid_extended_uri] = %r{
      ( # $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
        ( # $3 URL
          (
            (xmpp:) # Protocol
            (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)? # Authority (optional)
            (#{REGEXEN[:validate_nodeid]}+@)? # Username in path (optional)
            (#{REGEXEN[:valid_domain]}) # Domain in path
            (/#{REGEXEN[:validate_resid]}+)? # Resource in path (optional)
            (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
          ) | (
            (magnet:) # Protocol
            (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String
          )
        )
      )
    }iox
  end
  80. end