Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
This commit is contained in:
parent
d1e08bd38c
commit
d010e270e6
2 changed files with 47 additions and 2 deletions
|
@@ -13,6 +13,10 @@ class LanguageDetector
|
|||
detected_language_code || default_locale.to_sym
|
||||
end
|
||||
|
||||
def prepared_text
|
||||
simplified_text.strip
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def detected_language_code
|
||||
|
@@ -20,18 +24,21 @@ class LanguageDetector
|
|||
end
|
||||
|
||||
def result
|
||||
@result ||= @identifier.find_language(text_without_urls)
|
||||
@result ||= @identifier.find_language(prepared_text)
|
||||
end
|
||||
|
||||
def detected_language_reliable?
|
||||
result.reliable?
|
||||
end
|
||||
|
||||
def text_without_urls
|
||||
def simplified_text
|
||||
text.dup.tap do |new_text|
|
||||
URI.extract(new_text).each do |url|
|
||||
new_text.gsub!(url, '')
|
||||
end
|
||||
new_text.gsub!(Account::MENTION_RE, '')
|
||||
new_text.gsub!(Tag::HASHTAG_RE, '')
|
||||
new_text.gsub!(/\s+/, ' ')
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue