whippy-edition/app/services/process_feed_service.rb

# Processes an Atom feed document on behalf of an account: passes the
# feed-level <author> element to UpdateRemoteProfileService and hands every
# <entry> to ProcessEntry, which creates or removes the matching status.
class ProcessFeedService < BaseService
  ACTIVITY_NS = 'http://activitystrea.ms/spec/1.0/'.freeze
  THREAD_NS   = 'http://purl.org/syndication/thread/1.0'.freeze

  # @param body [String] raw XML of the feed (or of a single entry)
  # @param account [Object] account record the feed belongs to; it is used as
  #   the author of every entry that carries no <author> element of its own
  # @return [Array] the statuses returned for post/share entries; entries that
  #   were skipped or deleted contribute nothing
  def call(body, account)
    xml = Nokogiri::XML(body)

    update_author(xml, account)
    process_entries(xml, account)
  end

  private

  # Refreshes the account's profile from the feed's <author> element.
  # Does nothing when the document root is not a <feed>.
  def update_author(xml, account)
    return if xml.at_xpath('/xmlns:feed').nil?
    UpdateRemoteProfileService.new.call(xml.at_xpath('/xmlns:feed/xmlns:author'), account)
  end

  # Runs ProcessEntry over every <entry> in the document. Entries are walked
  # bottom-to-top so that an entry is handled before the ones listed above it,
  # which may refer to it (e.g. as the status they reply to).
  def process_entries(xml, account)
    xml.xpath('//xmlns:entry').reverse_each.map { |entry| ProcessEntry.new.call(entry, account) }.compact
  end

  # Handles a single Atom <entry> element.
  class ProcessEntry
    # URL prefixes stripped from verb / object-type values to obtain the bare term
    SCHEMA_PREFIXES = [
      'http://activitystrea.ms/schema/1.0/',
      'http://ostatus.org/schema/1.0/',
    ].freeze

    SUPPORTED_VERBS = [:post, :share, :delete].freeze
    SUPPORTED_TYPES = [:activity, :note, :comment].freeze

    # href used in "mentioned" links to address the public collection; it does
    # not identify an account
    PUBLIC_COLLECTION = 'http://activityschema.org/collection/public'.freeze

    # @param xml [Nokogiri::XML::Node] the <entry> element
    # @param account [Object] account the surrounding feed belongs to
    # @return the created (or already known) status for post/share entries,
    #   nil for delete entries and for unsupported verb/type combinations
    def call(xml, account)
      @account = account
      @xml     = xml

      return if skip_unsupported_type?

      case verb
      when :post, :share then create_status
      when :delete       then delete_status
      end
    end

    private

    # Builds the status for the current entry (attaching the shared status for
    # a :share), saves it and queues it for distribution.
    #
    # NOTE(review): when the entry's status already exists it is still saved
    # and queued for distribution again — confirm re-distribution is intended.
    def create_status
      Rails.logger.debug "Creating remote status #{id}"
      status = status_from_xml(@xml)

      if verb == :share
        # The shared status is embedded in the entry as <activity:object>
        shared_node   = @xml.at_xpath('.//activity:object', activity: ACTIVITY_NS)
        status.reblog = status_from_xml(shared_node)
      end

      status.save!
      Rails.logger.debug "Queuing remote status #{status.id} (#{id}) for distribution"
      DistributionWorker.perform_async(status.id)
      status
    end

    # Removes the status whose URI equals the entry id, if one is stored.
    # Always returns nil.
    #
    # NOTE(review): the lookup is by URI only and does not check that the
    # status belongs to @account — confirm that a feed cannot remove a status
    # owned by a different account.
    def delete_status
      Rails.logger.debug "Deleting remote status #{id}"
      status = Status.find_by(uri: id)
      RemoveStatusService.new.call(status) unless status.nil?
      nil
    end

    # True unless the entry has both a supported verb and a supported object type.
    def skip_unsupported_type?
      !(SUPPORTED_VERBS.include?(verb) && SUPPORTED_TYPES.include?(type))
    end

    # Returns the status described by +entry+: the stored one when its id is
    # already known, otherwise a newly created record together with its
    # thread link, mentions, hashtags and media attachments.
    def status_from_xml(entry)
      uri      = id(entry)
      existing = find_status(uri)
      return existing unless existing.nil?

      status = Status.create!(
        uri: uri,
        url: url(entry),
        account: account?(entry) ? find_or_resolve_account(acct(entry)) : @account,
        text: content(entry),
        created_at: published(entry)
      )

      if thread?(entry)
        status.thread = find_or_resolve_status(status, *thread(entry))
        # Saved here because create_status only saves the top-level status; a
        # shared original built by this method would otherwise keep its thread
        # link in memory only (unless the reblog association autosaves, which
        # is not visible from here).
        status.save!
      end

      mentions_from_xml(status, entry)
      hashtags_from_xml(status, entry)
      media_from_xml(status, entry)

      status
    end

    # Resolves an "user@domain" address to an account via FollowRemoteAccountService.
    def find_or_resolve_account(acct)
      FollowRemoteAccountService.new.call(acct)
    end

    # Looks up the status a reply refers to. When it is not stored yet, a
    # ThreadResolveWorker job is queued with the parent's id and the URL of
    # the missing status, and nil is returned.
    def find_or_resolve_status(parent, uri, url)
      status = find_status(uri)
      ThreadResolveWorker.perform_async(parent.id, url) if status.nil?

      status
    end

    # Finds a stored status by its URI. URIs that TagManager recognises as
    # local ids are translated to the database id first.
    #
    # NOTE(review): Status.find raises when no record with that id exists,
    # which aborts processing of the feed — confirm this is intended.
    def find_status(uri)
      if TagManager.instance.local_id?(uri)
        local_id = TagManager.instance.unique_tag_to_local_id(uri, 'Status')
        return Status.find(local_id)
      end

      Status.find_by(uri: uri)
    end

    # Records a mention for every account referenced by a
    # <link rel="mentioned"> element, once per account.
    def mentions_from_xml(parent, xml)
      processed_account_ids = []

      xml.xpath('./xmlns:link[@rel="mentioned"]').each do |link|
        href = link['href']

        # A link without href cannot be resolved, and the public collection is
        # not an account
        next if href.nil? || href == PUBLIC_COLLECTION

        uri = Addressable::URI.parse(href)

        mentioned_account = if TagManager.instance.local_domain?(uri.host)
                              Account.find_local(uri.path.gsub('/users/', ''))
                            else
                              Account.find_by(url: href) || FetchRemoteAccountService.new.call(href)
                            end

        next if mentioned_account.nil? || processed_account_ids.include?(mentioned_account.id)

        if mentioned_account.local?
          # Send notifications
          NotificationMailer.mention(mentioned_account, parent).deliver_later unless mentioned_account.blocking?(parent.account)
        end

        mentioned_account.mentions.where(status: parent).first_or_create(status: parent)

        # So we can skip duplicate mentions
        processed_account_ids << mentioned_account.id
      end
    end

    # Passes the terms of all <category> elements to ProcessHashtagsService.
    def hashtags_from_xml(parent, xml)
      tags = xml.xpath('./xmlns:category').map { |category| category['term'] }
      ProcessHashtagsService.new.call(parent, tags)
    end

    # Creates (or re-uses) one media attachment per <link rel="enclosure">
    # that has an href.
    def media_from_xml(parent, xml)
      xml.xpath('./xmlns:link[@rel="enclosure"]').each do |link|
        href = link['href']
        next unless href

        media = MediaAttachment.where(status: parent, remote_url: href).first_or_initialize(account: parent.account, status: parent, remote_url: href)

        begin
          media.file_remote_url = href
          media.save
        rescue Paperclip::Errors::NotIdentifiedByImageMagickError
          # Best effort: skip this enclosure and carry on with the next one
          next
        end
      end
    end

    # Content of the entry's <id> element.
    def id(xml = @xml)
      xml.at_xpath('./xmlns:id').content
    end

    # Activity verb of the entry as a symbol; :post when the element is absent.
    def verb(xml = @xml)
      schema_term(xml.at_xpath('./activity:verb', activity: ACTIVITY_NS), :post)
    end

    # Object type of the entry as a symbol; :activity when the element is absent.
    def type(xml = @xml)
      schema_term(xml.at_xpath('./activity:object-type', activity: ACTIVITY_NS), :activity)
    end

    # Strips the known schema URL prefixes from the node's content and returns
    # the remainder as a symbol, or +default+ when the node is missing.
    def schema_term(node, default)
      return default if node.nil?

      SCHEMA_PREFIXES.reduce(node.content) { |term, prefix| term.gsub(prefix, '') }.to_sym
    end

    # href of the entry's <link rel="alternate">, or nil when there is none.
    def url(xml = @xml)
      link = xml.at_xpath('./xmlns:link[@rel="alternate"]')
      link.nil? ? nil : link['href']
    end

    # Content of the entry's <content> element.
    def content(xml = @xml)
      xml.at_xpath('./xmlns:content').content
    end

    # Content of the entry's <published> element (not parsed here).
    def published(xml = @xml)
      xml.at_xpath('./xmlns:published').content
    end

    # Whether the entry carries a <thr:in-reply-to> element.
    def thread?(xml = @xml)
      !xml.at_xpath('./thr:in-reply-to', thr: THREAD_NS).nil?
    end

    # [ref, href] attributes of the entry's <thr:in-reply-to> element.
    # Only call after thread? returned true.
    def thread(xml = @xml)
      thr = xml.at_xpath('./thr:in-reply-to', thr: THREAD_NS)
      [thr['ref'], thr['href']]
    end

    # Whether the entry carries its own <author> element.
    def account?(xml = @xml)
      !xml.at_xpath('./xmlns:author').nil?
    end

    # "name@host" built from the entry author's <name> and the host of its <uri>.
    def acct(xml = @xml)
      username   = xml.at_xpath('./xmlns:author/xmlns:name').content
      author_uri = xml.at_xpath('./xmlns:author/xmlns:uri').content
      domain     = Addressable::URI.parse(author_uri).host

      "#{username}@#{domain}"
    end
  end
end
Send Salmon interactions 2016-02-24 20:57:29 +09:00			`class ProcessFeedService < BaseService`
Fix namespace parsing in Atom feeds 2016-10-11 01:05:52 +09:00			`ACTIVITY_NS = 'http://activitystrea.ms/spec/1.0/'.freeze`
			`THREAD_NS = 'http://purl.org/syndication/thread/1.0'.freeze`

Initial commit 2016-02-21 06:53:20 +09:00			`def call(body, account)`
			`xml = Nokogiri::XML(body)`
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00
			`update_author(xml, account)`
			`process_entries(xml, account)`
Fix subscriptions:clear task, refactor feeds, refactor streamable activites and atom feed generation to some extent, as well as the way mentions are stored 2016-03-25 10:13:30 +09:00			`end`
Initial commit 2016-02-21 06:53:20 +09:00
Fix subscriptions:clear task, refactor feeds, refactor streamable activites and atom feed generation to some extent, as well as the way mentions are stored 2016-03-25 10:13:30 +09:00			`private`
Update profile information and download avatar of remote accounts 2016-02-28 22:26:26 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def update_author(xml, account)`
			`return if xml.at_xpath('/xmlns:feed').nil?`
			`UpdateRemoteProfileService.new.call(xml.at_xpath('/xmlns:feed/xmlns:author'), account)`
			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def process_entries(xml, account)`
			`xml.xpath('//xmlns:entry').reverse_each.map { \|entry\| ProcessEntry.new.call(entry, account) }.compact`
			`end`
Respect "delete" verb on remote feeds 2016-03-16 18:46:15 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`class ProcessEntry`
			`def call(xml, account)`
			`@account = account`
			`@xml = xml`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`return if skip_unsupported_type?`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`case verb`
			`when :post, :share`
			`return create_status`
			`when :delete`
			`return delete_status`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00			`end`
Fix subscriptions:clear task, refactor feeds, refactor streamable activites and atom feed generation to some extent, as well as the way mentions are stored 2016-03-25 10:13:30 +09:00			`end`
Adding a Mention model, test stubs 2016-02-25 08:17:01 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`private`
Refactored generation of unique tags, URIs and object URLs into own classes, as well as formatting of content 2016-09-10 03:04:34 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def create_status`
			`Rails.logger.debug "Creating remote status #{id}"`
			`status = status_from_xml(@xml)`
Adding hashtags 2016-11-05 23:20:05 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`if verb == :share`
			`original_status = status_from_xml(xml.at_xpath('.//activity:object', activity: ACTIVITY_NS))`
			`status.reblog = original_status`
			`end`
Refactored generation of unique tags, URIs and object URLs into own classes, as well as formatting of content 2016-09-10 03:04:34 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`status.save!`
			`Rails.logger.debug "Queuing remote status #{status.id} (#{id}) for distribution"`
Add link to github project to footer, move FanOutOnWriteService calls to DistributionWorker. That isn't the heaviest service, yet, but gotta start somewhere 2016-03-25 11:22:26 +09:00			`DistributionWorker.perform_async(status.id)`
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`status`
Fix subscriptions:clear task, refactor feeds, refactor streamable activites and atom feed generation to some extent, as well as the way mentions are stored 2016-03-25 10:13:30 +09:00			`end`
Adding a test for ReblogService, fixing mentions for remote statuses 2016-02-29 05:22:56 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def delete_status`
			`Rails.logger.debug "Deleting remote status #{id}"`
			`status = Status.find_by(uri: id)`
			`RemoveStatusService.new.call(status) unless status.nil?`
			`nil`
			`end`
Fix duplication of media attachments when a remote status reblogs a local one 2016-10-15 03:14:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def skip_unsupported_type?`
			`!([:post, :share, :delete].include?(verb) && [:activity, :note, :comment].include?(type))`
			`end`
Adding a test for ReblogService, fixing mentions for remote statuses 2016-02-29 05:22:56 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def status_from_xml(entry)`
			`# Return early if status already exists in db`
			`status = find_status(id(entry))`
			`return status unless status.nil?`

			`status = Status.create!({`
			`uri: id(entry),`
			`url: url(entry),`
			`account: account?(entry) ? find_or_resolve_account(acct(entry)) : @account,`
			`text: content(entry),`
			`created_at: published(entry),`
			`})`

			`if thread?(entry)`
			`status.thread = find_or_resolve_status(status, *thread(entry))`
			`end`
Fix #54 - Fetch remote accounts by URL from mentions Fetching atom extracted from FetchRemoteAccountService and FetchRemoteStatusService into FetchAtomService. Mentions of the constant "http://activityschema.org/collection/public" skipped as it's not a real URL/user. 2016-09-26 23:42:38 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`mentions_from_xml(status, entry)`
			`hashtags_from_xml(status, entry)`
			`media_from_xml(status, entry)`
Improve code style 2016-09-30 04:28:21 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`status`
			`end`
Adding a test for ReblogService, fixing mentions for remote statuses 2016-02-29 05:22:56 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def find_or_resolve_account(acct)`
			`FollowRemoteAccountService.new.call(acct)`
			`end`
Use FanOutOnWriteService AFTER processing mentions 2016-03-19 08:41:29 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def find_or_resolve_status(parent, uri, url)`
			`status = find_status(uri)`
			`ThreadResolveWorker.perform_async(parent.id, url) if status.nil?`
Handle remote account mentions a little better by trying a URL lookup in the db 2016-09-23 04:10:36 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`status`
			`end`
Fix #54 - Fetch remote accounts by URL from mentions Fetching atom extracted from FetchRemoteAccountService and FetchRemoteStatusService into FetchAtomService. Mentions of the constant "http://activityschema.org/collection/public" skipped as it's not a real URL/user. 2016-09-26 23:42:38 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def find_status(uri)`
			`if TagManager.instance.local_id?(uri)`
			`local_id = TagManager.instance.unique_tag_to_local_id(uri, 'Status')`
			`return Status.find(local_id)`
Adding a test for ReblogService, fixing mentions for remote statuses 2016-02-29 05:22:56 +09:00			`end`
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00
			`Status.find_by(uri: uri)`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00			`end`

ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def mentions_from_xml(parent, xml)`
			`processed_account_ids = []`
Fix duplication of media attachments when a remote status reblogs a local one 2016-10-15 03:14:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`xml.xpath('./xmlns:link[@rel="mentioned"]').each do \|link\|`
			`next if link['href'] == 'http://activityschema.org/collection/public'`
PostStatusService can attach media to status, ProcessFeedService likewise 2016-09-06 01:39:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`url = Addressable::URI.parse(link['href'])`
Fix #51 - prevent duplicate attachments for remote statuses 2016-09-23 03:42:20 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`mentioned_account = if TagManager.instance.local_domain?(url.host)`
			`Account.find_local(url.path.gsub('/users/', ''))`
			`else`
			`Account.find_by(url: link['href']) \|\| FetchRemoteAccountService.new.call(link['href'])`
			`end`
Improve code style 2016-09-30 04:28:21 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`next if mentioned_account.nil? \|\| processed_account_ids.include?(mentioned_account.id)`
PostStatusService can attach media to status, ProcessFeedService likewise 2016-09-06 01:39:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`if mentioned_account.local?`
			`# Send notifications`
			`NotificationMailer.mention(mentioned_account, parent).deliver_later unless mentioned_account.blocking?(parent.account)`
			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`mentioned_account.mentions.where(status: parent).first_or_create(status: parent)`
Add shared statuses to the database 2016-02-25 01:23:59 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`# So we can skip duplicate mentions`
			`processed_account_ids << mentioned_account.id`
			`end`
Adding e-mail notifications about mentions, follows, favourites and reblogs. Fixing another mention recording bug 2016-03-20 03:20:07 +09:00			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def hashtags_from_xml(parent, xml)`
			`tags = xml.xpath('./xmlns:category').map { \|category\| category['term'] }`
			`ProcessHashtagsService.new.call(parent, tags)`
Fix #24 - Thread resolving for remote statuses This is a big one, so let me enumerate: Accounts as well as stream entry pages now contain Link headers that reference the Atom feed and Webfinger URL for the former and Atom entry for the latter. So you only need to HEAD those resources to get that information, no need to download and parse HTML <link>s. ProcessFeedService will now queue ThreadResolveWorker for each remote status that it cannot find otherwise. Furthermore, entries are now processed in reverse order (from bottom to top) in case a newer entry references a chronologically previous one. ThreadResolveWorker uses FetchRemoteStatusService to obtain a status and attach the child status it was queued for to it. FetchRemoteStatusService looks up the URL, first with a HEAD, tests if it's an Atom feed, in which case it processes it directly. Next for Link headers to the Atom feed, in which case that is fetched and processed. Lastly if it's HTML, it is checked for <link>s to the Atom feed, and if such is found, that is fetched and processed. The account for the status is derived from author/name attribute in the XML and the hostname in the URL (domain). FollowRemoteAccountService and ProcessFeedService are used. This means that potentially threads are resolved recursively until a dead-end is encountered, however it is performed asynchronously over background jobs, so it should be ok. 2016-09-21 08:34:14 +09:00			`end`
Initial commit 2016-02-21 06:53:20 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def media_from_xml(parent, xml)`
			`xml.xpath('./xmlns:link[@rel="enclosure"]').each do \|link\|`
			`next unless link['href']`
Respect "delete" verb on remote feeds 2016-03-16 18:46:15 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`media = MediaAttachment.where(status: parent, remote_url: link['href']).first_or_initialize(account: parent.account, status: parent, remote_url: link['href'])`
Initial commit 2016-02-21 06:53:20 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`begin`
			`media.file_remote_url = link['href']`
			`media.save`
			`rescue Paperclip::Errors::NotIdentifiedByImageMagickError`
			`next`
			`end`
			`end`
Initial commit 2016-02-21 06:53:20 +09:00			`end`
Add shared statuses to the database 2016-02-25 01:23:59 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def id(xml = @xml)`
			`xml.at_xpath('./xmlns:id').content`
Add shared statuses to the database 2016-02-25 01:23:59 +09:00			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def verb(xml = @xml)`
			`raw = xml.at_xpath('./activity:verb', activity: ACTIVITY_NS).content`
			`raw.gsub('http://activitystrea.ms/schema/1.0/', '').gsub('http://ostatus.org/schema/1.0/', '').to_sym`
			`rescue`
			`:post`
Fix #24 - Thread resolving for remote statuses This is a big one, so let me enumerate: Accounts as well as stream entry pages now contain Link headers that reference the Atom feed and Webfinger URL for the former and Atom entry for the latter. So you only need to HEAD those resources to get that information, no need to download and parse HTML <link>s. ProcessFeedService will now queue ThreadResolveWorker for each remote status that it cannot find otherwise. Furthermore, entries are now processed in reverse order (from bottom to top) in case a newer entry references a chronologically previous one. ThreadResolveWorker uses FetchRemoteStatusService to obtain a status and attach the child status it was queued for to it. FetchRemoteStatusService looks up the URL, first with a HEAD, tests if it's an Atom feed, in which case it processes it directly. Next for Link headers to the Atom feed, in which case that is fetched and processed. Lastly if it's HTML, it is checked for <link>s to the Atom feed, and if such is found, that is fetched and processed. The account for the status is derived from author/name attribute in the XML and the hostname in the URL (domain). FollowRemoteAccountService and ProcessFeedService are used. This means that potentially threads are resolved recursively until a dead-end is encountered, however it is performed asynchronously over background jobs, so it should be ok. 2016-09-21 08:34:14 +09:00			`end`

ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def type(xml = @xml)`
			`raw = xml.at_xpath('./activity:object-type', activity: ACTIVITY_NS).content`
			`raw.gsub('http://activitystrea.ms/schema/1.0/', '').gsub('http://ostatus.org/schema/1.0/', '').to_sym`
			`rescue`
			`:activity`
			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def url(xml = @xml)`
			`link = xml.at_xpath('./xmlns:link[@rel="alternate"]')`
			`link['href']`
			`end`
Adding a test for ReblogService, fixing mentions for remote statuses 2016-02-29 05:22:56 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def content(xml = @xml)`
			`xml.at_xpath('./xmlns:content').content`
			`end`
Add shared statuses to the database 2016-02-25 01:23:59 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def published(xml = @xml)`
			`xml.at_xpath('./xmlns:published').content`
			`end`
Add shared statuses to the database 2016-02-25 01:23:59 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def thread?(xml = @xml)`
			`!xml.at_xpath('./thr:in-reply-to', thr: THREAD_NS).nil?`
			`end`
Feed processing except fetching remote statuses 2016-02-24 09:28:53 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def thread(xml = @xml)`
			`thr = xml.at_xpath('./thr:in-reply-to', thr: THREAD_NS)`
			`[thr['ref'], thr['href']]`
			`end`
Do not PuSH-subscribe to remote accounts when creating them for salmon 2016-02-24 11:05:40 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def account?(xml = @xml)`
			`!xml.at_xpath('./xmlns:author').nil?`
			`end`
Adding a Mention model, test stubs 2016-02-25 08:17:01 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`def acct(xml = @xml)`
			`username = xml.at_xpath('./xmlns:author/xmlns:name').content`
			`url = xml.at_xpath('./xmlns:author/xmlns:uri').content`
			`domain = Addressable::URI.parse(url).host`
Separate PuSH subscriptions from following, add mastodon:push:refresh task, respect hub.lease_seconds (fix #46) 2016-09-20 07:39:03 +09:00
ProcessFeedService refactor 2016-11-08 09:32:34 +09:00			`"#{username}@#{domain}"`
			`end`
Separate PuSH subscriptions from following, add mastodon:push:refresh task, respect hub.lease_seconds (fix #46) 2016-09-20 07:39:03 +09:00			`end`
Initial commit 2016-02-21 06:53:20 +09:00			`end`