0
0
Fork 0

Do not pass unknown encoding names to nokogiri. (#30987)

This commit is contained in:
David Roetzel 2024-07-10 16:25:39 +02:00 committed by GitHub
parent 36592d10aa
commit 2ea9336b68
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 35 additions and 1 deletions

View file

@ -274,7 +274,7 @@ class LinkDetailsExtractor
end
def detect_encoding_and_parse_document
[detect_encoding, nil, @html_charset].uniq.each do |encoding|
[detect_encoding, nil, header_encoding].uniq.each do |encoding|
document = Nokogiri::HTML(@html, nil, encoding)
return document if document.to_s.valid_encoding?
end
@ -286,6 +286,13 @@ class LinkDetailsExtractor
guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
end
def header_encoding
Encoding.find(@html_charset).name if @html_charset
rescue ArgumentError
# Encoding from HTTP header is not recognized by ruby
nil
end
def detector
@detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
detector.strip_tags = true