refactor: html parsing

2023-01-07 10:31:48 +01:00 · 2023-01-07 10:31:48 +01:00 · 6944a74653
commit 6944a74653
parent d76e4bfaa5
8 changed files with 152 additions and 124 deletions
--- a/components/content/ContentRich.setup.ts
+++ b/components/content/ContentRich.setup.ts
@ -4,27 +4,23 @@ defineOptions({
  name: 'ContentRich',
 })

-const { content, emojis, markdown = true } = defineProps<{
+const {
+  content,
+  emojis,
+  markdown = true,
+} = defineProps<{
  content: string
-  markdown?: boolean
  emojis?: Emoji[]
+  markdown?: boolean
 }>()

-const useEmojis = computed(() => {
-  const result: Emoji[] = []
-  if (emojis)
-    result.push(...emojis)
-
-  result.push(...currentCustomEmojis.value.emojis)
-
-  return emojisArrayToObject(result)
-})
+const emojisObject = useEmojisFallback(() => emojis)

 export default () => h(
  'span',
  { class: 'content-rich', dir: 'auto' },
  contentToVNode(content, {
-    emojis: useEmojis.value,
+    emojis: emojisObject.value,
    markdown,
  }),
 )
--- a/components/status/StatusBody.vue
+++ b/components/status/StatusBody.vue
@ -1,23 +1,37 @@
 <script setup lang="ts">
 import type { Status, StatusEdit } from 'masto'

-const { status, withAction = true } = defineProps<{
+const {
+  status,
+  withAction = true,
+} = defineProps<{
  status: Status | StatusEdit
  withAction?: boolean
 }>()

 const { translation } = useTranslation(status)
+
+const emojisObject = useEmojisFallback(() => status.emojis)
+const vnode = $computed(() => {
+  if (!status.content)
+    return null
+  const vnode = contentToVNode(status.content, {
+    emojis: emojisObject.value,
+    markdown: true,
+  })
+  return vnode
+})
 </script>

 <template>
  <div class="status-body" whitespace-pre-wrap break-words :class="{ 'with-action': withAction }">
-    <ContentRich
+    <span
      v-if="status.content"
-      class="line-compact"
-      :content="status.content"
-      :emojis="status.emojis"
-      :lang="'language' in status && status.language"
-    />
+      class="content-rich line-compact" dir="auto"
+      :lang="('language' in status && status.language) || undefined"
+    >
+      <component :is="vnode" />
+    </span>
    <div v-else />
    <template v-if="translation.visible">
      <div my2 h-px border="b base" bg-base />
--- a/composables/content-parse.ts
+++ b/composables/content-parse.ts
@ -5,6 +5,34 @@ import { ELEMENT_NODE, TEXT_NODE, h, parse, render } from 'ultrahtml'
 import { findAndReplaceEmojisInText } from '@iconify/utils'
 import { emojiRegEx, getEmojiAttributes } from '../config/emojis'

+export interface ContentParseOptions {
+  emojis?: Record<string, Emoji>
+  markdown?: boolean
+  replaceUnicodeEmoji?: boolean
+  astTransforms?: Transform[]
+}
+
+const sanitizerBasicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u)
+const sanitizer = sanitize({
+  // Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb
+  br: {},
+  p: {},
+  a: {
+    href: filterHref(),
+    class: sanitizerBasicClasses,
+    rel: set('nofollow noopener noreferrer'),
+    target: set('_blank'),
+  },
+  span: {
+    class: sanitizerBasicClasses,
+  },
+  // Allow elements potentially created for Markdown code blocks in parseMastodonHTML
+  pre: {},
+  code: {
+    class: filterClasses(/^language-\w+$/),
+  },
+})
+
 const decoder = process.client ? document.createElement('textarea') : null
 export function decodeHtml(text: string) {
  if (!decoder)
@ -18,11 +46,19 @@ export function decodeHtml(text: string) {
 * Parse raw HTML from Mastodon server to AST,
 * with interop of custom emojis and inline Markdown syntax
 */
-export function parseMastodonHTML(html: string, customEmojis: Record<string, Emoji> = {}, markdown = true, forTiptap = false) {
+export function parseMastodonHTML(
+  html: string,
+  options: ContentParseOptions = {},
+) {
+  const {
+    markdown = true,
+    replaceUnicodeEmoji = true,
+  } = options
+
  if (markdown) {
    // Handle code blocks
    html = html
-      .replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang, raw) => {
+      .replace(/>(```|~~~)(\w*)([\s\S]+?)\1/g, (_1, _2, lang: string, raw: string) => {
        const code = htmlToText(raw)
        const classes = lang ? ` class="language-${lang}"` : ''
        return `><pre><code${classes}>${code}</code></pre>`
@ -30,39 +66,31 @@ export function parseMastodonHTML(html: string, customEmojis: Record<string, Emo
  }

  // Always sanitize the raw HTML data *after* it has been modified
-  const basicClasses = filterClasses(/^(h-\S*|p-\S*|u-\S*|dt-\S*|e-\S*|mention|hashtag|ellipsis|invisible)$/u)
-  return transformSync(parse(html), [
-    sanitize({
-      // Allow basic elements as seen in https://github.com/mastodon/mastodon/blob/17f79082b098e05b68d6f0d38fabb3ac121879a9/lib/sanitize_ext/sanitize_config.rb
-      br: {},
-      p: {},
-      a: {
-        href: filterHref(),
-        class: basicClasses,
-        rel: set('nofollow noopener noreferrer'),
-        target: set('_blank'),
-      },
-      span: {
-        class: basicClasses,
-      },
-      // Allow elements potentially created for Markdown code blocks above
-      pre: {},
-      code: {
-        class: filterClasses(/^language-\w+$/),
-      },
-    }),
-    // Unicode emojis to images, but only if not converting HTML for Tiptap
-    !forTiptap ? replaceUnicodeEmoji() : noopTransform(),
-    markdown ? formatMarkdown() : noopTransform(),
-    replaceCustomEmoji(customEmojis),
-  ])
+  const transforms: Transform[] = [
+    sanitizer,
+    ...options.astTransforms || [],
+  ]
+
+  if (replaceUnicodeEmoji)
+    transforms.push(transformUnicodeEmoji)
+
+  if (markdown)
+    transforms.push(transformMarkdown)
+
+  transforms.push(replaceCustomEmoji(options.emojis || {}))
+
+  return transformSync(parse(html), transforms)
 }

 /**
 * Converts raw HTML from Mastodon server to HTML for Tiptap editor
 */
 export function convertMastodonHTML(html: string, customEmojis: Record<string, Emoji> = {}) {
-  const tree = parseMastodonHTML(html, customEmojis, true, true)
+  const tree = parseMastodonHTML(html, {
+    emojis: customEmojis,
+    markdown: true,
+    replaceUnicodeEmoji: false,
+  })
  return render(tree)
 }

@ -162,11 +190,6 @@ function transformSync(doc: Node, transforms: Transform[]) {
  return doc
 }

-// A transformation that does nothing. Useful for conditional transform chains.
-function noopTransform(): Transform {
-  return node => node
-}
-
 // A tree transform for sanitizing elements & their attributes.
 type AttrSanitizers = Record<string, (value: string | undefined) => string | undefined>
 function sanitize(allowedElements: Record<string, AttrSanitizers>): Transform {
@ -241,27 +264,25 @@ function filterHref() {
  }
 }

-function replaceUnicodeEmoji(): Transform {
-  return (node) => {
-    if (node.type !== TEXT_NODE)
-      return node
+function transformUnicodeEmoji(node: Node) {
+  if (node.type !== TEXT_NODE)
+    return node

-    let start = 0
+  let start = 0

-    const matches = [] as (string | Node)[]
-    findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => {
-      const attrs = getEmojiAttributes(match)
-      matches.push(result.slice(start))
-      matches.push(h('img', { src: attrs.src, alt: attrs.alt, class: attrs.class }))
-      start = result.length + match.match.length
-      return undefined
-    })
-    if (matches.length === 0)
-      return node
+  const matches = [] as (string | Node)[]
+  findAndReplaceEmojisInText(emojiRegEx, node.value, (match, result) => {
+    const attrs = getEmojiAttributes(match)
+    matches.push(result.slice(start))
+    matches.push(h('img', { src: attrs.src, alt: attrs.alt, class: attrs.class }))
+    start = result.length + match.match.length
+    return undefined
+  })
+  if (matches.length === 0)
+    return node

-    matches.push(node.value.slice(start))
-    return matches.filter(Boolean)
-  }
+  matches.push(node.value.slice(start))
+  return matches.filter(Boolean)
 }

 function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform {
@ -286,47 +307,45 @@ function replaceCustomEmoji(customEmojis: Record<string, Emoji>): Transform {
  }
 }

-function formatMarkdown(): Transform {
-  const replacements: [RegExp, (c: (string | Node)[]) => Node][] = [
-    [/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])],
-    [/\*\*(.*?)\*\*/g, c => h('b', null, c)],
-    [/\*(.*?)\*/g, c => h('em', null, c)],
-    [/~~(.*?)~~/g, c => h('del', null, c)],
-    [/`([^`]+?)`/g, c => h('code', null, c)],
-  ]
+const _markdownReplacements: [RegExp, (c: (string | Node)[]) => Node][] = [
+  [/\*\*\*(.*?)\*\*\*/g, c => h('b', null, [h('em', null, c)])],
+  [/\*\*(.*?)\*\*/g, c => h('b', null, c)],
+  [/\*(.*?)\*/g, c => h('em', null, c)],
+  [/~~(.*?)~~/g, c => h('del', null, c)],
+  [/`([^`]+?)`/g, c => h('code', null, c)],
+]

-  function process(value: string) {
-    const results = [] as (string | Node)[]
+function _markdownProcess(value: string) {
+  const results = [] as (string | Node)[]

-    let start = 0
-    while (true) {
-      let found: { match: RegExpMatchArray; replacer: (c: (string | Node)[]) => Node } | undefined
+  let start = 0
+  while (true) {
+    let found: { match: RegExpMatchArray; replacer: (c: (string | Node)[]) => Node } | undefined

-      for (const [re, replacer] of replacements) {
-        re.lastIndex = start
+    for (const [re, replacer] of _markdownReplacements) {
+      re.lastIndex = start

-        const match = re.exec(value)
-        if (match) {
-          if (!found || match.index < found.match.index!)
-            found = { match, replacer }
-        }
+      const match = re.exec(value)
+      if (match) {
+        if (!found || match.index < found.match.index!)
+          found = { match, replacer }
      }
-
-      if (!found)
-        break
-
-      results.push(value.slice(start, found.match.index))
-      results.push(found.replacer(process(found.match[1])))
-      start = found.match.index! + found.match[0].length
    }

-    results.push(value.slice(start))
-    return results.filter(Boolean)
+    if (!found)
+      break
+
+    results.push(value.slice(start, found.match.index))
+    results.push(found.replacer(_markdownProcess(found.match[1])))
+    start = found.match.index! + found.match[0].length
  }

-  return (node) => {
-    if (node.type !== TEXT_NODE)
-      return node
-    return process(node.value)
-  }
+  results.push(value.slice(start))
+  return results.filter(Boolean)
+}
+
+function transformMarkdown(node: Node) {
+  if (node.type !== TEXT_NODE)
+    return node
+  return _markdownProcess(node.value)
 }
--- a/composables/content-render.ts
+++ b/composables/content-render.ts
@ -1,9 +1,9 @@
-import type { Emoji } from 'masto'
 import { TEXT_NODE } from 'ultrahtml'
 import type { Node } from 'ultrahtml'
 import { Fragment, h, isVNode } from 'vue'
 import type { VNode } from 'vue'
 import { RouterLink } from 'vue-router'
+import type { ContentParseOptions } from './content-parse'
 import { decodeHtml, parseMastodonHTML } from './content-parse'
 import ContentCode from '~/components/content/ContentCode.vue'
 import AccountHoverWrapper from '~/components/account/AccountHoverWrapper.vue'
@ -13,12 +13,9 @@ import AccountHoverWrapper from '~/components/account/AccountHoverWrapper.vue'
 */
 export function contentToVNode(
  content: string,
-  { emojis = {}, markdown = true }: {
-    emojis?: Record<string, Emoji>
-    markdown?: boolean
-  } = {},
+  options?: ContentParseOptions,
 ): VNode {
-  const tree = parseMastodonHTML(content, emojis, markdown)
+  const tree = parseMastodonHTML(content, options)
  return h(Fragment, (tree.children as Node[]).map(n => treeToVNode(n)))
 }

--- a/composables/emojis.ts
+++ b/composables/emojis.ts
@ -51,3 +51,16 @@ export const customEmojisData = computed(() => currentCustomEmojis.value.emojis.
      emojis: transformEmojiData(currentCustomEmojis.value.emojis),
    }]
  : undefined)
+
+export function useEmojisFallback(emojisGetter: () => Emoji[] | undefined) {
+  return computed(() => {
+    const result: Emoji[] = []
+    const emojis = emojisGetter()
+    if (emojis)
+      result.push(...emojis)
+
+    result.push(...currentCustomEmojis.value.emojis)
+
+    return emojisArrayToObject(result)
+  })
+}
--- a/config/emojis.ts
+++ b/config/emojis.ts
@ -1,3 +1,4 @@
+// @unimport-disabled
 import { emojiFilename, emojiPrefix, emojiRegEx } from '@iconify-emoji/twemoji'
 import type { EmojiRegexMatch } from '@iconify/utils/lib/emoji/replace/find'
 import { getEmojiMatchesInText } from '@iconify/utils/lib/emoji/replace/find'
--- a/tests/snapshots/content-rich.test.ts.snap
+++ b/tests/snapshots/content-rich.test.ts.snap
@ -1,12 +1,5 @@
 // Vitest Snapshot v1

-exports[`content-rich > JavaScript hrefs get removed 1`] = `
-"<p>
-  <a href=\\"#\\" rel=\\"nofollow noopener noreferrer\\" target=\\"_blank\\">click me</a>
-</p>
-"
-`;
-
 exports[`content-rich > code frame 1`] = `
 "<p>Testing code block</p><p><pre lang=\\"ts\\">import { useMouse, usePreferredDark } from &#39;@vueuse/core&#39;
 // tracks mouse position
@ -75,8 +68,3 @@ exports[`content-rich > link + mention 1`] = `
 </p>
 "
 `;
-
-exports[`content-rich > script tags get removed 1`] = `
-"<p></p>
-"
-`;
--- a/tests/html-parse.test.ts
+++ b/tests/html-parse.test.ts
@ -67,7 +67,7 @@ describe('html-parse', () => {
 })

 async function render(input: string, emojis?: Record<string, Emoji>) {
-  const tree = parseMastodonHTML(input, emojis)
+  const tree = parseMastodonHTML(input, { emojis })
  const html = await renderTree(tree)
  let formatted = ''
  const serializedText = treeToText(tree).trim()