Skip to content

Commit

Permalink
FIX: use URI.regexp to find URLs in plain text
Browse files Browse the repository at this point in the history
  • Loading branch information
ZogStriP committed Jun 6, 2019
1 parent 1f73a3b commit b1c5ea4
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 20 deletions.
41 changes: 21 additions & 20 deletions lib/plain_text_to_markdown.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
class PlainTextToMarkdown
SIGNATURE_SEPARATOR ||= "-- ".freeze

URL_REGEX ||= /((?:https?:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.])(?:[^\s()<>]+|\([^\s()<>]+\))+(?:\([^\s()<>]+\)|[^`!()\[\]{};:'".,<>?«»“”‘’\s]))/i

def initialize(plaintext, opts = {})
@plaintext = plaintext
@lines = []
Expand Down Expand Up @@ -150,15 +148,17 @@ def convert_text(line)
converted_text
end

# Matches URI-like strings that use the http, https, ftp or mailto schemes.
# `||=` leaves an already-defined constant untouched.
# NOTE(review): Ruby's URI documentation marks URI.regexp as obsolete - confirm
# it is still available on the Ruby version this code targets.
URL_REGEX ||= URI.regexp(%w{http https ftp mailto})
# Opening brackets/quotes, regexp-escaped so the string can be interpolated
# into a regexp character class.
BEFORE ||= Regexp.escape(%Q|([<«"“'‘|)
# Closing brackets/quotes, regexp-escaped the same way.
AFTER ||= Regexp.escape(%Q|)]>»"”'’|)

def replace_duplicate_links(text)
text.to_enum(:scan, URL_REGEX)
.map { $& }
.group_by { |url| url }
.keep_if { |_, urls | urls.length > 1 }
.keys.each do |url|
urls = Set.new
text.scan(URL_REGEX) { urls << $& }

urls.each do |url|
escaped = Regexp.escape(url)
text.gsub!(Regexp.new(%Q|#{escaped}(\s*[()\\[\\]<>«»'"“”‘’]?#{escaped}[()\\[\\]<>«»'"“”‘’]?)|, Regexp::IGNORECASE), url)
text.gsub!(Regexp.new(%Q|#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?|, Regexp::IGNORECASE), url)
end

text
Expand All @@ -175,19 +175,20 @@ def indent_with_non_breaking_spaces(text)
end

# Escapes Markdown and HTML special characters in `text` while leaving any
# URL matched by URL_REGEX unescaped. Returns the escaped string.
#
# NOTE(review): this span comes from a rendered diff, so statements from two
# implementations appear interleaved: one accumulates `escaped_text` by
# splitting on URL_REGEX, the other swaps URLs for random placeholder tokens.
# The view does not mark which lines are additions and which are deletions -
# confirm against the repository before relying on the exact statement list.
def escape_special_characters(text)
escaped_text = +""
# Collect every matched URL; the Set keeps one entry per distinct URL.
urls = Set.new
text.scan(URL_REGEX) { urls << $& }

text.split(URL_REGEX).each do |text_part|
if text_part =~ URL_REGEX
# no escaping within URLs
escaped_text << text_part
else
# escape Markdown and HTML
text_part.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
escaped_text << CGI.escapeHTML(text_part)
end
end
# Pair each URL with a random hex token: { token => url }.
hoisted = urls
.map { |url| [SecureRandom.hex, url] }
.to_h

escaped_text
# Swap each URL for its token so the escaping below cannot alter it.
# gsub! works in place, so the caller's string is modified here.
hoisted.each { |h, url| text.gsub!(url, h) }

# Backslash-escape Markdown punctuation, then HTML-escape the result.
text.gsub!(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
text = CGI.escapeHTML(text)

# Put the original URLs back in place of their tokens.
hoisted.each { |h, url| text.gsub!(h, url) }

text
end
end
8 changes: 8 additions & 0 deletions spec/components/plain_text_to_markdown_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,14 @@ def to_markdown(text, opts = {})
expect(to_markdown("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz"))
.to eq("foo https://www.example.com/foo.html bar https://www.example.com/foo.html baz")
end

# Regression guard: converting a URL whose query string ends in 20 dots must
# finish within 0.25 seconds and raise nothing (a Timeout::Error would fail
# the example).
# NOTE(review): presumably this protects against very slow regexp matching
# on such input - confirm against the commit this example was added in.
it "does not explode with weird links" do
expect {
Timeout::timeout(0.25) {
to_markdown("https://www.discourse.org/?boom=#{"." * 20}")
}
}.not_to raise_error
end
end

context "code" do
Expand Down

0 comments on commit b1c5ea4

Please sign in to comment.