twitter_regex.rb 4.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. module Twitter
  # Reopens Twitter::Regex to add to / override entries of the REGEXEN table
  # that drive URL, XMPP URI and magnet URI matching.
  #
  # REGEXEN itself, and several entries interpolated below
  # (:valid_url_preceding_chars, :valid_domain, :valid_port_number,
  # :valid_url_query_chars, :valid_url_query_ending_chars,
  # :validate_url_unreserved, :validate_url_pct_encoded,
  # :validate_url_sub_delims), are not defined in this file.
  # NOTE(review): presumably they are provided by the twitter-text gem, which
  # must be loaded before this file -- confirm.
  #
  # Interpolation happens when each literal is evaluated, so the assignment
  # order below is significant: an entry must be (re)defined before any
  # pattern that embeds it.
  class Regex
    # A single character allowed anywhere inside a URL path: anything except
    # whitespace, angle brackets, parentheses and the question mark.
    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou

    # A parenthesised run of path characters, optionally containing one nested
    # parenthesised run.
    #
    # This is assigned BEFORE :valid_url_path_ending_chars on purpose: that
    # pattern interpolates this one, and previously it was built first, so it
    # embedded whatever :valid_url_balanced_parens held beforehand instead of
    # this definition. A path ending in a parenthesised group therefore
    # followed different rules than the same group in the middle of a path.
    REGEXEN[:valid_url_balanced_parens] = /
      \(
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}+
          |
          # allow one nested level of balanced parentheses
          (?:
            #{REGEXEN[:valid_general_url_path_chars]}*
            \(
              #{REGEXEN[:valid_general_url_path_chars]}+
            \)
            #{REGEXEN[:valid_general_url_path_chars]}*
          )
        )
      \)
    /iox

    # What a URL path may end with: either a single character outside the
    # excluded punctuation set, or a complete balanced-parentheses group.
    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou

    # One path chunk: either path characters (with optional balanced
    # parenthesised groups) terminated by a valid ending, or path characters
    # followed by a slash.
    REGEXEN[:valid_url_path] = /(?:
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}*
        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
        #{REGEXEN[:valid_url_path_ending_chars]}
      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
    )/iox

    # Full URL matcher. Capture groups are numbered in the inline comments.
    REGEXEN[:valid_url] = %r{
      (                                                                                     # $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]})                                            # $2 Preceding character
        (                                                                                   # $3 URL
          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)?                                  # $4 Protocol (optional)
          (#{REGEXEN[:valid_domain]})                                                       # $5 Domain(s)
          (?::(#{REGEXEN[:valid_port_number]}))?                                            # $6 Port number (optional)
          (/#{REGEXEN[:valid_url_path]}*)?                                                  # $7 URL Path and anchor
          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
        )
      )
    }iox

    # One character of the node (user) part of an XMPP address: an unreserved
    # character, a percent-encoded sequence, or one of the listed delimiters.
    REGEXEN[:validate_nodeid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      [!$()*+,;=]
    )/iox

    # One character of the resource part of an XMPP address: an unreserved
    # character, a percent-encoded sequence, or a sub-delimiter.
    REGEXEN[:validate_resid] = /(?:
      #{REGEXEN[:validate_url_unreserved]}|
      #{REGEXEN[:validate_url_pct_encoded]}|
      #{REGEXEN[:validate_url_sub_delims]}
    )/iox

    # "xmpp:" URI with optional authority, optional username, mandatory
    # domain, optional resource and optional query string.
    REGEXEN[:xmpp_uri] = %r{
      (xmpp:)                                                                           # Protocol
      (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     # Authority (optional)
      (#{REGEXEN[:validate_nodeid]}+@)?                                                 # Username in path (optional)
      (#{REGEXEN[:valid_domain]})                                                       # Domain in path
      (/#{REGEXEN[:validate_resid]}+)?                                                  # Resource in path (optional)
      (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
    }iox

    # "magnet:" URI: the scheme followed by a mandatory query string.
    REGEXEN[:magnet_uri] = %r{
      (magnet:)                                                                        # Protocol
      (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String
    }iox

    # Matches either an XMPP or a magnet URI together with its preceding
    # character; group 3 is the URI itself.
    REGEXEN[:valid_extended_uri] = %r{
      (                                          # $1 total match
        (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
        (                                        # $3 URL
          (#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
        )
      )
    }iox
  end
  module Extractor
    # Extracts a list of all XMPP and magnet URIs included in the Toot
    # <tt>text</tt>, along with their indices.
    #
    # +text+ is converted with +to_s+ before it is inspected, so +nil+ and
    # non-String values are handled uniformly. If the resulting string
    # contains no XMPP or magnet URIs, an empty array is returned.
    #
    # +options+ is accepted for interface compatibility and is not used here.
    #
    # Returns an array of hashes of the form
    # <tt>{ url: String, indices: [start, end] }</tt>, where the indices are
    # taken from capture group 3 of <tt>Twitter::Regex[:valid_extended_uri]</tt>
    # via <tt>MatchData#char_begin</tt> / <tt>#char_end</tt>.
    # NOTE(review): those two MatchData helpers are not defined in this file;
    # presumably they come from the twitter-text gem -- confirm.
    #
    # If a block is given then it will be called for each XMPP or magnet URI,
    # after the whole text has been scanned.
    def extract_extra_uris_with_indices(text, options = {}) # :yields: uri, start, end
      text = text.to_s
      # Cheap pre-check: both supported schemes require a colon.
      return [] unless text.include?(":")

      urls = []
      text.scan(Twitter::Regex[:valid_extended_uri]) do
        match = Regexp.last_match
        urls << {
          url: match[3],
          indices: [match.char_begin(3), match.char_end(3)]
        }
      end

      urls.each { |url| yield url[:url], url[:indices].first, url[:indices].last } if block_given?
      urls
    end
  end
  93. end