Allow iFrames from trusted sources in MarkdownView

- Add a new Showdown extension to allow iFrames from trusted sources
- Add a property in the AppModel to store the list of trusted sources
- Allow iFrames through the xss filter with limited attributes so that they can be processed by the new Showdown extension
- Add the new Showdown extension to the MarkdownView

Issue #1383
NCEAS · Sep 20, 2024 · 27bf4c5 · 27bf4c5
1 parent eb2e381
commit 27bf4c5
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 0 deletions.
diff --git a/src/components/showdown/extensions/showdown-iframes.js b/src/components/showdown/extensions/showdown-iframes.js
@@ -0,0 +1,159 @@
+/**
+ * SHOWDOWN IFRAMES
+ *
+ * This extension filters out iframes with src attributes that
+ * are not from a trusted source
+ */
+
+/** List of trusted URL patterns */
+const TRUSTED_SOURCES = MetacatUI.appModel.get("trustedContentSources") || [];
+
+/**
+ * The sandbox to add to iframes from trusted sources. This allows the iframe
+ * some capabilities, such as running scripts and accessing it's own origin.
+ */
+const SANDBOX = `sandbox="allow-scripts allow-same-origin"`;
+
+/**
+ * Regular expression that finds all iframes in the markdown content. The regex
+ * captures the full iframe tag, the src attribute, the inner content, and the
+ * closing tag, if it exists.
+ * @type {RegExp}
+ */
+const IFRAME_REGEX =
+  /<iframe[^>]*?\bsrc="([^"]*)"[^>]*?>([\s\S]*?)(<\/iframe>)?/g;
+
+/**
+ * Function to convert URL patterns with wildcards to regex patterns.
+ * @param {string} wildcardPattern - The URL pattern with wildcards
+ * @returns {RegExp} - The regex pattern
+ */
+function patternToRegex(wildcardPattern) {
+  // Extract protocol if specified
+  let protocol = "";
+  let pattern = wildcardPattern;
+  const protocolMatch = pattern.match(/^(https?:\/\/)/);
+  if (protocolMatch) {
+    [, protocol] = protocolMatch;
+    pattern = wildcardPattern.slice(protocol.length);
+  }
+
+  // Escape special regex characters except for '*'
+  let escapedPattern = pattern.replace(/[-/\\^$+?.()|[\]{}]/g, "\\$&");
+  // Replace '*' with '.*'
+  escapedPattern = escapedPattern.replace(/\*/g, ".*");
+  // Escape the protocol
+  const escapedProtocol = protocol.replace(/[-/\\^$+?.()|[\]{}]/g, "\\$&");
+  // Build the full regex pattern
+  const regexString = `^${escapedProtocol}${escapedPattern}$`;
+
+  return new RegExp(regexString, "i"); // Case-insensitive matching
+}
+
+/**
+ * Check if a URL is valid according to the trusted sources. Trusted sources may
+ * use wildcards (*) to match multiple URLs. For example, the trusted source
+ * "https://*dataone.org/*" will match any URL that starts with "https://",
+ * contains "dataone.org", and ends with a path. The trusted source
+ * "*arcticdata.io*" will match any URL that contains "arcticdata.io". It could
+ * also include wildcards at any position, such as
+ * "*arcticdata.io/*\/something".
+ * @param {string} url - The URL to check
+ * @returns {boolean} - True if the URL is trusted, false otherwise
+ */
+function isTrustedUrl(url) {
+  if (!TRUSTED_SOURCES?.length) return false;
+
+  try {
+    const urlObj = new URL(url);
+    if (!urlObj.protocol.startsWith("http")) {
+      return false;
+    }
+  } catch (e) {
+    return false;
+  }
+
+  // Check if the URL matches any of the trusted sources
+  for (let i = 0; i < TRUSTED_SOURCES.length; i += 1) {
+    const pattern = TRUSTED_SOURCES[i];
+    const regex = patternToRegex(pattern);
+
+    if (regex.test(url)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/**
+ * Replace iFrames that are NOT from trusted sources with a link to the source
+ * URL. Make the iFrames from trusted sources secure by adding the 'sandbox'
+ * attribute, which restricts the iframe's capabilities. Remove any inner
+ * content from the iframe.
+ * @param {string} iframe - The full iframe tag
+ * @param {string} src - The src attribute of the iframe
+ * @param {string} _innerContent - The inner content of the iframe tag
+ * @param {string} closingTag - The closing iframe tag
+ * @param {number} _index - The index of the match
+ * @param {string} _markdown - The full markdown content
+ * @returns {string} - The secure iframe tag
+ */
+const secureIFrame = (
+  iframe,
+  src,
+  _innerContent,
+  closingTag,
+  _index,
+  _markdown,
+) => {
+  // Return as a link instead of an iframe if the source is not trusted
+  if (!isTrustedUrl(src)) {
+    return `<a href="${src}" target="_blank" rel="noopener noreferrer"><b>External Content</b>: ${src}</a>`;
+  }
+
+  // Find the position of the first '>' that ends the opening iframe tag
+  const openingTagEndIndex = iframe.indexOf(">");
+
+  // Add the 'sandbox' attr and strip out any inner content
+  if (openingTagEndIndex !== -1) {
+    // Extract the opening tag
+    let openingTag = iframe.slice(0, openingTagEndIndex);
+
+    // Ensure 'sandbox' attribute exists with the correct value
+    if (!/\bsandbox=/.test(openingTag)) {
+      // Add the 'sandbox' attribute
+      openingTag += ` ${SANDBOX}`;
+    } else {
+      // Update the existing 'sandbox' attribute to have the correct value
+      openingTag = openingTag.replace(/\bsandbox="[^"]*"/, SANDBOX);
+    }
+
+    // Close the opening tag
+    openingTag += ">";
+
+    let newIframe;
+    if (closingTag) {
+      // Reconstruct the iframe without inner content and include the closing tag
+      newIframe = `${openingTag}${closingTag}`;
+    } else {
+      // If there is no closing tag, self-close the iframe
+      newIframe = openingTag.replace(">", " />");
+    }
+
+    return newIframe;
+  }
+
+  // If the iframe tag is malformed and doesn't contain '>', return it as is
+  return iframe;
+};
+
+const extension = {
+  type: "output",
+  regex: IFRAME_REGEX,
+  replace: secureIFrame,
+};
+
+define(["showdown"], (showdown) => {
+  showdown.extension("showdown-iframes", () => [extension]);
+});
diff --git a/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js b/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js
@@ -12,6 +12,7 @@ define(['showdown', 'xss'], function (showdown, xss) {
     var options = {
       css: false,
       allowList: {
+        iframe: ["src", "width", "height", "frameborder", "allowfullscreen"],
         a: ["target", "href", "title", "class", "target"],
         abbr: ["title"],
         address: [],

diff --git a/src/js/app.js b/src/js/app.js
@@ -80,6 +80,8 @@ require.config({
       "/components/showdown/extensions/showdown-xss-filter/xss.min",
     showdownHtags:
       MetacatUI.root + "/components/showdown/extensions/showdown-htags",
+    showdownIframes:
+      MetacatUI.root + "/components/showdown/extensions/showdown-iframes",
     // woofmark - markdown editor
     woofmark: MetacatUI.root + "/components/woofmark.min",
     // drop zone creates drag and drop areas

diff --git a/src/js/models/AppModel.js b/src/js/models/AppModel.js
@@ -1731,6 +1731,27 @@ define(["jquery", "underscore", "backbone"], function ($, _, Backbone) {
            */
           feverUrl: "",
 
+          /**
+           * A list of trusted content sources from which MetacatUI can safely
+           * embed external content. This property is used to define URLs or URL
+           * patterns that are considered secure for embedding content in
+           * iframes, especially when rendering user-generated Markdown content.
+           *
+           * Each source in the list can include wildcards (`*`) to match any
+           * subdomain or path. For example, `"https://*.dataone.org/*"` matches
+           * any subdomain of `dataone.org` over HTTPS, and `"*arcticdata.io*"`
+           * matches any URL containing `arcticdata.io`.
+           *
+           * Set to an empty array or a falsy value to disable all embedded content.
+           *
+           * @type {string[]}
+           * @since 0.0.0
+           */
+          trustedContentSources: [
+            "https://www.youtube.com/embed/*",
+            "https://player.vimeo.com/video/*",
+          ],
+
           /** If true, then archived content is available in the search index.
            * Set to false if this MetacatUI is using a Metacat version before 2.10.0
            * @type {boolean}

diff --git a/src/js/views/MarkdownView.js b/src/js/views/MarkdownView.js
@@ -179,6 +179,7 @@ define([
           "footnotes",
           "showdown-citation",
           "showdown-images",
+          "showdown-iframes",
         ];
 
         var numTestsTodo = SDextensions.length;
@@ -219,6 +220,8 @@ define([
           regexCitation = /\[@.+\]/;
         // test for any <h.> tags
         (regexHtags = new RegExp("#\\s")), (regexImages = /!\[.*\]\(\S+\)/);
+        // test for anything that looks like an iframe. Keep it very general.
+        const regexIframes = /<iframe.*?src="(.*?)"(.*?)><\/iframe>/g;
 
         // ================================================================
         // Test for and load each as required each showdown extension
@@ -342,6 +345,15 @@ define([
         } else {
           updateExtensionList("showdown-images", (required = false));
         }
+
+        // --- Test for iframes --- //
+        if (regexIframes.test(markdown)) {
+          require(["showdownIframes"], function (showdownIframes) {
+            updateExtensionList("showdown-iframes", (required = true));
+          });
+        } else {
+          updateExtensionList("showdown-iframes", (required = false));
+        }
       },
 
       /**