module Twitter::TwitterText
# Configuration override. The enclosing module is opened with the compact
# `Twitter::TwitterText` form, so this presumably reopens the library's own
# Configuration class rather than defining a new one (confirm against the gem).
class Configuration
  # Reports whether emoji parsing is enabled.
  #
  # Hard-coded: this override always answers +false+.
  def emoji_parsing_enabled
    false
  end
end
# Overrides and extends entries of the REGEXEN pattern table.
#
# REGEXEN itself, and the entries that are read here but never assigned in
# this block (:valid_url_preceding_chars, :valid_domain, :valid_port_number,
# :validate_url_unreserved, :validate_url_pct_encoded and
# :validate_url_sub_delims), are defined outside this block.
#
# Assignment order matters: "#{...}" inside a regex literal embeds the value
# the table holds at the moment the literal is evaluated, so an entry has to
# be assigned before any pattern that interpolates it.
class Regex
  # A single character allowed inside a URL path: anything except whitespace,
  # angle brackets, parentheses and "?".
  REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou

  # A parenthesised run of path characters, optionally containing one nested
  # parenthesised run.
  #
  # Assigned before :valid_url_path_ending_chars because that pattern
  # interpolates this entry. Assigning it afterwards would leave the
  # ending-chars pattern embedding whatever REGEXEN[:valid_url_balanced_parens]
  # held previously, while :valid_url_path below embedded this definition.
  REGEXEN[:valid_url_balanced_parens] = /
    \(
      (?:
        #{REGEXEN[:valid_general_url_path_chars]}+
        |
        # allow one nested level of balanced parentheses
        (?:
          #{REGEXEN[:valid_general_url_path_chars]}*
          \(
            #{REGEXEN[:valid_general_url_path_chars]}+
          \)
          #{REGEXEN[:valid_general_url_path_chars]}*
        )
      )
    \)
  /iox

  # What a URL path may end with: either one character outside the listed
  # whitespace/punctuation set, or a complete balanced-parentheses group.
  REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou

  # Non-ASCII code point ranges accepted in query strings. The ranges line up
  # with the "ucschar" production of RFC 3987 (IRIs).
  REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou

  # Private-use code point ranges accepted in query strings. The ranges line
  # up with the "iprivate" production of RFC 3987.
  REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou

  # A single character allowed anywhere in a query string.
  REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou

  # A single character allowed as the last character of a query string; the
  # ASCII part is a narrower set than :valid_url_query_chars.
  REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou

  # One path chunk. Either path characters and balanced-parentheses groups
  # finished by a valid ending, or a run of path characters closed by "/".
  REGEXEN[:valid_url_path] = /(?:
    (?:
      #{REGEXEN[:valid_general_url_path_chars]}*
      (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
      #{REGEXEN[:valid_url_path_ending_chars]}
    )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
  )/iox

  # Full URL matcher. Capture groups are numbered in the inline comments.
  REGEXEN[:valid_url] = %r{
    (                                                                                     # $1 total match
      (#{REGEXEN[:valid_url_preceding_chars]})                                            # $2 Preceding character
      (                                                                                   # $3 URL
        ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher|gemini):\/\/)?                           # $4 Protocol (optional)
        (#{REGEXEN[:valid_domain]})                                                       # $5 Domain(s)
        (?::(#{REGEXEN[:valid_port_number]}))?                                            # $6 Port number (optional)
        (/#{REGEXEN[:valid_url_path]}*)?                                                  # $7 URL Path and anchor
        (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # $8 Query String
      )
    )
  }iox

  # One character of the node (user) part of an XMPP address: an unreserved
  # character, a percent-encoded octet, or one of "!$()*+,;=".
  REGEXEN[:validate_nodeid] = /(?:
    #{REGEXEN[:validate_url_unreserved]}|
    #{REGEXEN[:validate_url_pct_encoded]}|
    [!$()*+,;=]
  )/iox

  # One character of the resource part of an XMPP address: an unreserved
  # character, a percent-encoded octet, or a sub-delimiter.
  REGEXEN[:validate_resid] = /(?:
    #{REGEXEN[:validate_url_unreserved]}|
    #{REGEXEN[:validate_url_pct_encoded]}|
    #{REGEXEN[:validate_url_sub_delims]}
  )/iox

  # An "xmpp:" URI: optional authority, optional user, domain, optional
  # resource and optional query string.
  REGEXEN[:xmpp_uri] = %r{
    (xmpp:)                                                                           # Protocol
    (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     # Authority (optional)
    (#{REGEXEN[:validate_nodeid]}+@)?                                                 # Username in path (optional)
    (#{REGEXEN[:valid_domain]})                                                       # Domain in path
    (/#{REGEXEN[:validate_resid]}+)?                                                  # Resource in path (optional)
    (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? # Query String
  }iox

  # A "magnet:" URI: the scheme followed by a mandatory query string.
  REGEXEN[:magnet_uri] = %r{
    (magnet:)                                                                        # Protocol
    (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]}) # Query String
  }iox

  # Matches either an XMPP or a magnet URI together with the character that
  # precedes it. Group 3 is the URI itself.
  REGEXEN[:valid_extended_uri] = %r{
    (                                          # $1 total match
      (#{REGEXEN[:valid_url_preceding_chars]}) # $2 Preceding character
      (                                        # $3 URL
        (#{REGEXEN[:xmpp_uri]}) | (#{REGEXEN[:magnet_uri]})
      )
    )
  }iox
end
module Extractor
  # Extracts a list of all XMPP and magnet URIs included in the Toot <tt>text</tt> along
  # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
  # XMPP or magnet URIs an empty array will be returned.
  #
  # Each element of the returned array is a hash with two keys: <tt>:url</tt>
  # (the matched URI) and <tt>:indices</tt> (a two-element array holding the
  # start and end positions of the URI as reported by
  # <tt>char_begin</tt>/<tt>char_end</tt>).
  #
  # The second argument is accepted but not used.
  #
  # If a block is given then it will be called for each XMPP or magnet URI.
  def extract_extra_uris_with_indices(text, _options = {}) # :yields: uri, start, end
    # Every pattern behind :valid_extended_uri needs a scheme such as "xmpp:"
    # or "magnet:", so a text without any colon cannot match; skip the scan.
    return [] unless text && text.index(":")

    urls = []

    text.to_s.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do
      match = Regexp.last_match

      # Group 3 is the URI proper; groups 1 and 2 also cover the character
      # that precedes it. char_begin/char_end are MatchData helpers defined
      # outside this file.
      urls << {
        url: match[3],
        indices: [match.char_begin(3), match.char_end(3)]
      }
    end

    # Yield only after the scan has finished, so the caller's block cannot
    # disturb the match state the scan relies on.
    if block_given?
      urls.each do |entry|
        yield entry[:url], entry[:indices].first, entry[:indices].last
      end
    end

    urls
  end
end
end