0
0
Fork 0

Add Fetch All Replies Part 1: Backend (#32615)

Signed-off-by: sneakers-the-rat <sneakers-the-rat@protonmail.com>
Co-authored-by: jonny <j@nny.fyi>
Co-authored-by: Claire <claire.github-309c@sitedethib.com>
Co-authored-by: Kouhai <66407198+kouhaidev@users.noreply.github.com>
This commit is contained in:
Jonny Saunders 2025-03-12 02:03:01 -07:00 committed by GitHub
parent 2fe7172002
commit 46e13dd81c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 874 additions and 25 deletions

View file

@ -58,6 +58,8 @@ class Api::V1::StatusesController < Api::BaseController
statuses = [@status] + @context.ancestors + @context.descendants
render json: @context, serializer: REST::ContextSerializer, relationships: StatusRelationshipsPresenter.new(statuses, current_user&.account_id)
ActivityPub::FetchAllRepliesWorker.perform_async(@status.id) if !current_account.nil? && @status.should_fetch_replies?
end
def create

View file

@ -155,24 +155,49 @@ module JsonLdHelper
end
end
def fetch_resource(uri, id_is_known, on_behalf_of = nil, request_options: {})
# Fetch the resource given by uri.
# @param uri [String]
# @param id_is_known [Boolean]
# @param on_behalf_of [nil, Account]
# @param raise_on_error [Boolean, Symbol<:all, :temporary>] See {#fetch_resource_without_id_validation} for possible values
def fetch_resource(uri, id_is_known, on_behalf_of = nil, raise_on_error: false, request_options: {})
unless id_is_known
json = fetch_resource_without_id_validation(uri, on_behalf_of)
json = fetch_resource_without_id_validation(uri, on_behalf_of, raise_on_error: raise_on_error)
return if !json.is_a?(Hash) || unsupported_uri_scheme?(json['id'])
uri = json['id']
end
json = fetch_resource_without_id_validation(uri, on_behalf_of, request_options: request_options)
json = fetch_resource_without_id_validation(uri, on_behalf_of, raise_on_error: raise_on_error, request_options: request_options)
json.present? && json['id'] == uri ? json : nil
end
def fetch_resource_without_id_validation(uri, on_behalf_of = nil, raise_on_temporary_error = false, request_options: {})
# Fetch the resource given by uri
#
# If an error is raised, it contains the response and can be captured for handling like
#
# begin
# fetch_resource_without_id_validation(uri, nil, raise_on_error: :all)
# rescue Mastodon::UnexpectedResponseError => e
# e.response
# end
#
# @param uri [String]
# @param on_behalf_of [nil, Account]
# @param raise_on_error [Boolean, Symbol<:all, :temporary>]
# - +true+, +:all+ - raise if response code is not in the 2** range
# - +:temporary+ - raise if the response code is not an "unsalvageable error" like a 404
# (see {#response_error_unsalvageable?})
# - +false+ - do not raise, return +nil+
def fetch_resource_without_id_validation(uri, on_behalf_of = nil, raise_on_error: false, request_options: {})
on_behalf_of ||= Account.representative
build_request(uri, on_behalf_of, options: request_options).perform do |response|
raise Mastodon::UnexpectedResponseError, response unless response_successful?(response) || response_error_unsalvageable?(response) || !raise_on_temporary_error
raise Mastodon::UnexpectedResponseError, response if !response_successful?(response) && (
[true, :all].include?(raise_on_error) ||
(!response_error_unsalvageable?(response) && raise_on_error == :temporary)
)
body_to_json(response.body_with_limit) if response.code == 200 && valid_activitypub_content_type?(response)
end

View file

@ -0,0 +1,53 @@
# frozen_string_literal: true
# Scopes and predicates deciding whether the replies of a status should be
# (re-)fetched from its origin server, and whether a remote status is one we
# would not be notified about when it is updated or deleted.
module Status::FetchRepliesConcern
  extend ActiveSupport::Concern

  # Feature switch: enabled unless FETCH_REPLIES_ENABLED is set to anything
  # other than the string 'true'.
  FETCH_REPLIES_ENABLED = ENV.fetch('FETCH_REPLIES_ENABLED', 'true') == 'true'

  # Debounce windows (as durations) that keep reply fetching from being
  # triggered too often, to minimize DoS potential.
  FETCH_REPLIES_COOLDOWN_MINUTES = ENV.fetch('FETCH_REPLIES_COOLDOWN_MINUTES', 15).to_i.minutes
  FETCH_REPLIES_INITIAL_WAIT_MINUTES = ENV.fetch('FETCH_REPLIES_INITIAL_WAIT_MINUTES', 5).to_i.minutes

  included do
    # Statuses created inside / outside the initial wait window
    scope :created_recently, -> { where(created_at: FETCH_REPLIES_INITIAL_WAIT_MINUTES.ago..) }
    scope :not_created_recently, -> { where(created_at: ..FETCH_REPLIES_INITIAL_WAIT_MINUTES.ago) }

    # Statuses whose replies were fetched inside / outside the cooldown window
    # (never-fetched statuses count as "not fetched recently")
    scope :fetched_recently, -> { where(fetched_replies_at: FETCH_REPLIES_COOLDOWN_MINUTES.ago..) }
    scope :not_fetched_recently, -> { where(fetched_replies_at: [nil, ..FETCH_REPLIES_COOLDOWN_MINUTES.ago]) }

    # Relation-level counterparts of #should_fetch_replies?
    # (these do not consult FETCH_REPLIES_ENABLED)
    scope :should_not_fetch_replies, -> { local.or(created_recently.or(fetched_recently)) }
    scope :should_fetch_replies, -> { remote.not_created_recently.not_fetched_recently }

    # statuses for which we won't receive update or deletion actions,
    # and should update when fetching replies
    # Status from an account which either
    # a) has only remote followers
    # b) has local follows that were created after the last update time, or
    # c) has no known followers
    scope :unsubscribed, lambda {
      remote.merge(
        Status.left_outer_joins(account: :followers).where.not(followers_accounts: { domain: nil })
              .or(where.not('follows.created_at < statuses.updated_at'))
              .or(where(follows: { id: nil }))
      )
    }
  end

  # Whether replies to this status should be fetched now: the feature is
  # enabled, the status is remote, it is older than the initial wait window,
  # and its replies have not been fetched within the cooldown window.
  #
  # @return [Boolean]
  def should_fetch_replies?
    return false unless FETCH_REPLIES_ENABLED
    return false if local?
    return false if created_at > FETCH_REPLIES_INITIAL_WAIT_MINUTES.ago

    fetched_replies_at.nil? || fetched_replies_at <= FETCH_REPLIES_COOLDOWN_MINUTES.ago
  end

  # Whether this is a remote status with no follow of its author, by an
  # account with a nil domain, created at or before the status' updated_at.
  #
  # @return [Boolean] always false for local statuses
  def unsubscribed?
    return false if local?

    followed_locally_before_update = Follow.joins(:account).exists?(
      target_account: account.id,
      account: { domain: nil },
      created_at: ..updated_at
    )

    !followed_locally_before_update
  end
end

View file

@ -27,6 +27,7 @@
# edited_at :datetime
# trendable :boolean
# ordered_media_attachment_ids :bigint(8) is an Array
# fetched_replies_at :datetime
#
class Status < ApplicationRecord
@ -34,6 +35,7 @@ class Status < ApplicationRecord
include Discard::Model
include Paginable
include RateLimitable
include Status::FetchRepliesConcern
include Status::SafeReblogInsert
include Status::SearchConcern
include Status::SnapshotConcern

View file

@ -0,0 +1,68 @@
# frozen_string_literal: true
# Collects the reply URIs listed in a status' replies collection (following
# pagination), filters out those that should not be (re-)fetched right now,
# and enqueues a FetchReplyWorker job for each remaining URI.
class ActivityPub::FetchAllRepliesService < ActivityPub::FetchRepliesService
  include JsonLdHelper

  # Limit of replies to fetch per status
  MAX_REPLIES = (ENV['FETCH_REPLIES_MAX_SINGLE'] || 500).to_i

  # @param collection_or_uri [Hash, String] replies collection, or the URI to fetch it from
  # @param status_uri [String] URI of the status the collection belongs to
  # @param max_pages [nil, Integer] maximum number of collection pages to read (nil for no page limit)
  # @param request_id [nil, String] passed through to the enqueued FetchReplyWorker jobs
  # @return [nil, Array(Array<String>, Integer)] the enqueued reply URIs and the page count
  #   reported by collection_items, or nil when the collection could not be read
  def call(collection_or_uri, status_uri, max_pages = nil, request_id: nil)
    @allow_synchronous_requests = true
    @collection_or_uri = collection_or_uri
    @status_uri = status_uri

    @items, n_pages = collection_items(collection_or_uri, max_pages)
    @items = filtered_replies
    return if @items.nil?

    FetchReplyWorker.push_bulk(@items) { |reply_uri| [reply_uri, { 'request_id' => request_id }] }

    [@items, n_pages]
  end

  private

  # @return [nil, Array<String>] URIs to fetch, capped at MAX_REPLIES
  def filtered_replies
    return if @items.nil?

    # Find all statuses that we *shouldn't* update the replies for, and use that as a filter.
    # We don't assume that we have the statuses before they're created,
    # hence the negative filter -
    # "keep all these uris except the ones we already have"
    # instead of
    # "keep all these uris that match some conditions on existing Status objects"
    #
    # Typically we assume the number of replies we *shouldn't* fetch is smaller than the
    # replies we *should* fetch, so we also minimize the number of uris we should load here.
    #
    # The collection comes from a remote server: drop entries without a usable
    # URI and collapse duplicates so the same reply is never enqueued twice.
    uris = @items.filter_map { |item| value_or_id(item) }.uniq

    # Expand collection to get replies in the DB that were
    # - not included in the collection,
    # - that we have locally
    # - but we have no local followers and thus don't get updates/deletes for
    parent_id = Status.where(uri: @status_uri).pick(:id)
    unless parent_id.nil?
      unsubscribed_replies = Status
                             .where.not(uri: uris)
                             .where(in_reply_to_id: parent_id)
                             .unsubscribed
                             .pluck(:uri)
      uris.concat(unsubscribed_replies)
    end

    dont_update = Status.where(uri: uris).should_not_fetch_replies.pluck(:uri)

    # Reject all statuses that we already have in the db and shouldn't refresh,
    # then apply the per-status cap
    uris = (uris - dont_update).take(MAX_REPLIES)

    # touch only the existing statuses that we are actually about to update,
    # so statuses dropped by the cap stay eligible for a later fetch
    Status.where(uri: uris).should_fetch_replies.touch_all(:fetched_replies_at)

    Rails.logger.debug { "FetchAllRepliesService - #{@collection_or_uri}: Fetching filtered statuses: #{uris}" }
    uris
  end

  # Replies hosted on a different domain than the replied-to status are kept
  def filter_by_host?
    false
  end
end

View file

@ -33,7 +33,7 @@ class ActivityPub::FetchFeaturedCollectionService < BaseService
return collection_or_uri if collection_or_uri.is_a?(Hash)
return if non_matching_uri_hosts?(@account.uri, collection_or_uri)
fetch_resource_without_id_validation(collection_or_uri, local_follower, true)
fetch_resource_without_id_validation(collection_or_uri, local_follower, raise_on_error: :temporary)
end
def process_items(items)

View file

@ -45,7 +45,7 @@ class ActivityPub::FetchFeaturedTagsCollectionService < BaseService
return collection_or_uri if collection_or_uri.is_a?(Hash)
return if non_matching_uri_hosts?(@account.uri, collection_or_uri)
fetch_resource_without_id_validation(collection_or_uri, local_follower, true)
fetch_resource_without_id_validation(collection_or_uri, local_follower, raise_on_error: :temporary)
end
def process_items(items)

View file

@ -13,7 +13,7 @@ class ActivityPub::FetchRemoteStatusService < BaseService
@request_id = request_id || "#{Time.now.utc.to_i}-status-#{uri}"
@json = if prefetched_body.nil?
fetch_resource(uri, true, on_behalf_of)
fetch_status(uri, true, on_behalf_of)
else
body_to_json(prefetched_body, compare_id: uri)
end
@ -80,4 +80,20 @@ class ActivityPub::FetchRemoteStatusService < BaseService
def expected_object_type?
equals_or_includes_any?(@json['type'], ActivityPub::Activity::Create::SUPPORTED_TYPES + ActivityPub::Activity::Create::CONVERTED_TYPES)
end
# Fetch the ActivityPub JSON for a status.
#
# A 404 response for a status we already hold, that is unsubscribed and
# distributable, is treated as a deletion: a tombstone is recorded and the
# local copy is removed.
#
# @param uri [String]
# @param id_is_known [Boolean]
# @param on_behalf_of [nil, Account]
# @return [Hash, nil] the fetched JSON, or nil when the fetch failed
def fetch_status(uri, id_is_known, on_behalf_of = nil)
  fetch_resource(uri, id_is_known, on_behalf_of, raise_on_error: true)
rescue Mastodon::UnexpectedResponseError => e
  return unless e.response.code == 404

  # If this is a 404 from a status from an account that has no local followers, delete it
  existing_status = Status.find_by(uri: uri)
  if existing_status&.unsubscribed? && existing_status.distributable?
    Rails.logger.debug { "FetchRemoteStatusService - Got 404 for orphaned status with URI #{uri}, deleting" }
    Tombstone.find_or_create_by(uri: uri, account: existing_status.account)
    RemoveStatusService.new.call(existing_status, redraft: false)
  end

  # The caller stores this method's result as the status JSON; never leak the
  # return value of the removal service into it.
  nil
end
end

View file

@ -3,11 +3,14 @@
class ActivityPub::FetchRepliesService < BaseService
include JsonLdHelper
# Limit of fetched replies
MAX_REPLIES = 5
def call(parent_status, collection_or_uri, allow_synchronous_requests: true, request_id: nil)
@account = parent_status.account
@allow_synchronous_requests = allow_synchronous_requests
@items = collection_items(collection_or_uri)
@items, = collection_items(collection_or_uri)
return if @items.nil?
FetchReplyWorker.push_bulk(filtered_replies) { |reply_uri| [reply_uri, { 'request_id' => request_id }] }
@ -17,25 +20,39 @@ class ActivityPub::FetchRepliesService < BaseService
private
# Collect the items of a (possibly paginated) collection.
#
# Pages are followed through their `next` links until the class' MAX_REPLIES
# item limit is reached, +max_pages+ pages have been read, or there is no
# further page.
#
# @param collection_or_uri [Hash, String] collection, or the URI to fetch it from
# @param max_pages [nil, Integer] maximum number of pages to read (nil for no page limit)
# @return [nil, Array(Array, Integer)] the collected items and the number of pages
#   read, or nil when the collection could not be resolved to a Hash
def collection_items(collection_or_uri, max_pages = nil)
  collection = fetch_collection(collection_or_uri)
  return unless collection.is_a?(Hash)

  collection = fetch_collection(collection['first']) if collection['first'].present?
  return unless collection.is_a?(Hash)

  # Constants are resolved lexically, so a bare MAX_REPLIES here would always
  # be this class' limit; go through the receiver's class so that subclasses
  # defining their own MAX_REPLIES get their own limit applied.
  max_items = self.class::MAX_REPLIES

  all_items = []
  n_pages = 1
  while collection.is_a?(Hash)
    items = case collection['type']
            when 'Collection', 'CollectionPage'
              collection['items']
            when 'OrderedCollection', 'OrderedCollectionPage'
              collection['orderedItems']
            end

    all_items.concat(as_array(items))

    break if all_items.size >= max_items
    break if !max_pages.nil? && n_pages >= max_pages

    next_page = collection['next']
    break unless next_page.present?

    collection = fetch_collection(next_page)
    # Only count pages that were actually obtained, so the reported page count
    # matches the number of pages read
    n_pages += 1 if collection.is_a?(Hash)
  end

  [all_items, n_pages]
end
def fetch_collection(collection_or_uri)
return collection_or_uri if collection_or_uri.is_a?(Hash)
return unless @allow_synchronous_requests
return if non_matching_uri_hosts?(@account.uri, collection_or_uri)
return if filter_by_host? && non_matching_uri_hosts?(@account.uri, collection_or_uri)
# NOTE: For backward compatibility reasons, Mastodon signs outgoing
# queries incorrectly by default.
@ -45,19 +62,28 @@ class ActivityPub::FetchRepliesService < BaseService
#
# Therefore, retry with correct signatures if this fails.
begin
fetch_resource_without_id_validation(collection_or_uri, nil, true)
fetch_resource_without_id_validation(collection_or_uri, nil, raise_on_error: :temporary)
rescue Mastodon::UnexpectedResponseError => e
raise unless e.response && e.response.code == 401 && Addressable::URI.parse(collection_or_uri).query.present?
fetch_resource_without_id_validation(collection_or_uri, nil, true, request_options: { omit_query_string: false })
fetch_resource_without_id_validation(collection_or_uri, nil, raise_on_error: :temporary, request_options: { omit_query_string: false })
end
end
def filtered_replies
# Only fetch replies to the same server as the original status to avoid
# amplification attacks.
if filter_by_host?
# Only fetch replies to the same server as the original status to avoid
# amplification attacks.
# Also limit to 5 fetched replies to limit potential for DoS.
@items.map { |item| value_or_id(item) }.reject { |uri| non_matching_uri_hosts?(@account.uri, uri) }.take(5)
# Also limit to 5 fetched replies to limit potential for DoS.
@items.map { |item| value_or_id(item) }.reject { |uri| non_matching_uri_hosts?(@account.uri, uri) }.take(MAX_REPLIES)
else
@items.map { |item| value_or_id(item) }.take(MAX_REPLIES)
end
end
# Whether replies with a different domain than the replied_to post should be rejected.
# Always true for this service; it is consulted when fetching collection pages
# and when filtering the reply URIs to enqueue.
#
# @return [Boolean]
def filter_by_host?
true
end
end

View file

@ -69,6 +69,6 @@ class ActivityPub::SynchronizeFollowersService < BaseService
return collection_or_uri if collection_or_uri.is_a?(Hash)
return if non_matching_uri_hosts?(@account.uri, collection_or_uri)
fetch_resource_without_id_validation(collection_or_uri, nil, true)
fetch_resource_without_id_validation(collection_or_uri, nil, raise_on_error: :temporary)
end
end

View file

@ -0,0 +1,77 @@
# frozen_string_literal: true
# Fetch all replies to a status, querying recursively through
# ActivityPub replies collections, fetching any statuses that
# we either don't already have or we haven't checked for new replies
# in the Status::FETCH_REPLIES_COOLDOWN_MINUTES interval
class ActivityPub::FetchAllRepliesWorker
  include Sidekiq::Worker
  include ExponentialBackoff
  include JsonLdHelper

  sidekiq_options queue: 'pull', retry: 3

  # Global max replies to fetch per request (all replies, recursively)
  MAX_REPLIES = (ENV['FETCH_REPLIES_MAX_GLOBAL'] || 1000).to_i
  # Global max collection pages to read per request
  MAX_PAGES = (ENV['FETCH_REPLIES_MAX_PAGES'] || 500).to_i

  # @param parent_status_id [Integer] id of the status whose reply tree should be fetched
  # @param options [Hash] forwarded as keyword arguments to ActivityPub::FetchAllRepliesService#call
  # @return [nil, Set<String>] the reply URIs that were queued for fetching, or nil
  #   when nothing was done
  def perform(parent_status_id, options = {})
    # The status may have been deleted between enqueueing and execution;
    # in that case there is nothing to do and retrying would not help.
    @parent_status = Status.find_by(id: parent_status_id)
    return unless @parent_status&.should_fetch_replies?

    @parent_status.touch(:fetched_replies_at)
    Rails.logger.debug { "FetchAllRepliesWorker - #{@parent_status.uri}: Fetching all replies for status: #{@parent_status}" }

    uris_to_fetch, n_pages = get_replies(@parent_status.uri, MAX_PAGES, options)
    return if uris_to_fetch.nil?

    # Every URI seen so far, so that no reply is expanded twice
    fetched_uris = uris_to_fetch.to_set

    until uris_to_fetch.empty? || fetched_uris.length >= MAX_REPLIES || n_pages >= MAX_PAGES
      next_reply = uris_to_fetch.pop
      next if next_reply.nil?

      new_reply_uris, new_n_pages = get_replies(next_reply, MAX_PAGES - n_pages, options)
      next if new_reply_uris.nil?

      new_reply_uris = new_reply_uris.reject { |uri| fetched_uris.include?(uri) }

      uris_to_fetch.concat(new_reply_uris)
      fetched_uris.merge(new_reply_uris)
      n_pages += new_n_pages
    end

    Rails.logger.debug { "FetchAllRepliesWorker - #{parent_status_id}: fetched #{fetched_uris.length} replies" }
    fetched_uris
  end

  private

  # Resolve the replies collection of +status_uri+ and hand it to the service.
  #
  # @return [nil, Array] the service's result, or nil when no replies collection was found
  def get_replies(status_uri, max_pages, options = {})
    replies_collection_or_uri = get_replies_uri(status_uri)
    return if replies_collection_or_uri.nil?

    ActivityPub::FetchAllRepliesService.new.call(replies_collection_or_uri, status_uri, max_pages, **options.deep_symbolize_keys)
  end

  # Fetch the status object and return the value of its `replies` property.
  #
  # @return [nil, Object] the `replies` value, or nil when it is unavailable
  # @raise [StandardError] re-raises any error for the top-level status so the job is retried
  def get_replies_uri(parent_status_uri)
    json_status = fetch_resource(parent_status_uri, true)

    if json_status.nil?
      Rails.logger.debug { "FetchAllRepliesWorker - #{@parent_status.uri}: Could not get replies URI for #{parent_status_uri}, returned nil" }
      nil
    elsif !json_status.key?('replies')
      Rails.logger.debug { "FetchAllRepliesWorker - #{@parent_status.uri}: No replies collection found in ActivityPub object: #{json_status}" }
      nil
    else
      json_status['replies']
    end
  rescue => e
    Rails.logger.error { "FetchAllRepliesWorker - #{@parent_status.uri}: Caught exception while resolving replies URI #{parent_status_uri}: #{e} - #{e.message}" }
    # Raise if we can't get the collection for top-level status to trigger retry
    raise if parent_status_uri == @parent_status.uri

    nil
  end
end