forked from pingcap/discourse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplain_text_to_markdown.rb
194 lines (150 loc) · 4.71 KB
/
plain_text_to_markdown.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# frozen_string_literal: true
# Converts plain text into Markdown.
#
# The conversion
# * turns leading ">" markers into Markdown block quotes,
# * optionally re-joins soft-wrapped lines (the +format_flowed+ option),
# * leaves the content of fenced code blocks (```) untouched,
# * collapses a URL that is immediately repeated (e.g. "URL <URL>"),
# * backslash-escapes Markdown special characters and HTML-escapes the text
#   everywhere except inside URLs, and
# * replaces leading indentation with non-breaking spaces.
#
# NOTE(review): relies on ActiveSupport's +blank?+ / +present?+ being loaded.
class PlainTextToMarkdown
  # A line with exactly this text is never merged with a neighbouring line
  # when flowed lines are re-joined.
  SIGNATURE_SEPARATOR ||= "-- ".freeze

  # Matches http, https, ftp and mailto URLs.
  URL_REGEX ||= URI::DEFAULT_PARSER.make_regexp(%w[http https ftp mailto])

  # Characters that may directly precede / follow the repeated copy of a URL,
  # already escaped for use inside a regexp character class.
  BEFORE ||= Regexp.escape(%Q|([<«"“'‘|)
  AFTER ||= Regexp.escape(%Q|)]>»"”'’|)

  # A fenced code block, described by the lines holding its opening and
  # closing fence.
  class CodeBlock < Struct.new(:start_line, :end_line)
    def initialize(start_line, end_line = nil)
      super
    end

    # @return [Boolean] true once both the opening and the closing fence are known
    def valid?
      start_line.present? && end_line.present?
    end
  end

  # One line of input: its text without quote markers, the number of quote
  # markers that were removed and the code block it belongs to (if any).
  class Line < Struct.new(:text, :quote_level, :code_block)
    def initialize(text, quote_level = 0, code_block = nil)
      super
    end

    # @return [Boolean, nil] truthy when the line belongs to a closed code block
    def valid_code_block?
      code_block&.valid?
    end
  end

  # @param plaintext [String] the text to convert
  # @param opts [Hash]
  # @option opts [Boolean] :format_flowed re-join a line with the previous one
  #   when the previous one ends with a space (default: false)
  # @option opts [Boolean] :delete_flowed_space drop that trailing space when
  #   joining (default: false)
  def initialize(plaintext, opts = {})
    @plaintext = plaintext
    @lines = []
    @format_flowed = opts[:format_flowed] || false
    @delete_flowed_space = opts[:delete_flowed_space] || false
  end

  # Runs the conversion. Calling it repeatedly returns the same result.
  #
  # @return [String] the Markdown text, without trailing whitespace
  def to_markdown
    prepare_lines
    classify_lines

    markdown = +""
    last_quote_level = 0
    last_line_blank = false

    @lines.each do |line|
      current_line_blank = line.text.blank?

      # Consecutive blank lines are collapsed: only the first one is emitted.
      unless last_line_blank && current_line_blank
        if line.quote_level > 0
          quote_identifiers = ">" * line.quote_level
          # When the quote level drops, first emit a line that contains only
          # the quote markers of the new level.
          markdown << quote_identifiers << "\n" unless line.quote_level >= last_quote_level || current_line_blank
          markdown << quote_identifiers
          markdown << " " unless current_line_blank
        else
          # An unquoted, non-blank line that follows a quoted line is
          # separated from the quote by an empty line.
          markdown << "\n" unless last_quote_level == 0 || current_line_blank
        end

        markdown << convert_text(line)
        markdown << "\n"
      end

      last_line_blank = current_line_blank
      last_quote_level = line.quote_level
    end

    markdown.rstrip!
    markdown
  end

  private

  # Splits the plain text into Line objects, strips their quote markers and,
  # in flowed mode, merges soft-wrapped lines.
  def prepare_lines
    # Start from scratch so that repeated conversions do not accumulate lines.
    @lines = []
    previous_line = nil

    @plaintext.each_line do |raw_text|
      line = Line.new(raw_text.chomp)
      remove_quote_level_indicators!(line)

      if @format_flowed
        line = merge_lines(line, previous_line)
        # merge_lines hands back +previous_line+ itself when it merged. This
        # must be an identity check: Struct#== compares by value and would
        # also drop a new line that merely looks like the previous one.
        @lines << line unless line.equal?(previous_line)
      else
        @lines << line
      end

      previous_line = line
    end
  end

  # Assigns code blocks to all prepared lines, in order.
  def classify_lines
    previous_line = nil

    @lines.each do |line|
      classify_line_as_code!(line, previous_line)
      previous_line = line
    end
  end

  # Moves leading ">" characters from the text into the line's quote level.
  #
  # @param line [Line]
  def remove_quote_level_indicators!(line)
    match_data = line.text.match(/^(?<indicators>>+)\s?(?<text>.*)/)
    return unless match_data

    line.text = match_data[:text]
    line.quote_level = match_data[:indicators].length
  end

  # Appends +line+ to +previous_line+ when the previous line ends with a space
  # and both have the same quote level. Blank lines and signature separators
  # are never merged.
  #
  # @param line [Line]
  # @param previous_line [Line, nil]
  # @return [Line] +previous_line+ when the lines were merged, otherwise +line+
  def merge_lines(line, previous_line)
    return line if previous_line.nil? || line.text.blank?
    return line if line.text == SIGNATURE_SEPARATOR || previous_line.text == SIGNATURE_SEPARATOR
    return line unless line.quote_level == previous_line.quote_level && previous_line.text.end_with?(" ")

    previous_line.text = previous_line.text[0...-1] if @delete_flowed_space
    previous_line.text += line.text
    previous_line
  end

  # Tracks fenced code blocks. A line inherits the still open code block of
  # the previous line; a fence line (``` after at most three whitespace
  # characters) either closes that block or opens a new one.
  #
  # @param line [Line]
  # @param previous_line [Line, nil]
  def classify_line_as_code!(line, previous_line)
    line.code_block = previous_line.code_block unless previous_line.nil? || previous_line.valid_code_block?
    return unless line.text =~ /^\s{0,3}```/

    if line.code_block.present?
      line.code_block.end_line = line
    else
      line.code_block = CodeBlock.new(line)
    end
  end

  # Converts the text of a single line. Lines of a closed code block are
  # returned as they are, except that the fence lines lose their leading
  # whitespace.
  #
  # @param line [Line]
  # @return [String]
  def convert_text(line)
    text = line.text

    if line.valid_code_block?
      code_block = line.code_block
      fence_line = code_block.start_line.equal?(line) || code_block.end_line.equal?(line)
      return fence_line ? text.lstrip : text
    end

    converted_text = replace_duplicate_links(text)
    converted_text = escape_special_characters(converted_text)
    indent_with_non_breaking_spaces(converted_text)
  end

  # @param text [String]
  # @return [Set<String>] the distinct URLs found in +text+, in order of appearance
  def extract_urls(text)
    urls = Set.new
    # URL_REGEX contains groups, so the whole match is read from the match data.
    text.scan(URL_REGEX) { urls << Regexp.last_match(0) }
    urls
  end

  # Collapses a URL that is directly followed by a second copy of itself,
  # optionally separated by whitespace and wrapped in brackets or quotes
  # (e.g. "http://a.b <http://a.b>"), into a single URL.
  #
  # @param text [String] not modified
  # @return [String]
  def replace_duplicate_links(text)
    extract_urls(text).reduce(text) do |result, url|
      escaped = Regexp.escape(url)
      # A regexp literal is used on purpose: inside a double-quoted string
      # "\s" would be a plain space instead of the whitespace class.
      duplicate = /#{escaped}\s*[#{BEFORE}]?#{escaped}[#{AFTER}]?/i
      result.gsub(duplicate) { url }
    end
  end

  # Replaces a leading tab with two spaces and, when the indentation is wider
  # than one character, replaces each indentation character with "&nbsp;".
  #
  # @param text [String]
  # @return [String]
  def indent_with_non_breaking_spaces(text)
    text.sub(/^\s+/) do |indentation|
      # replace tabs with 2 spaces
      indentation = indentation.gsub("\t", " " * 2)

      # replace indentation with non-breaking spaces
      indentation.length > 1 ? "&nbsp;" * indentation.length : indentation
    end
  end

  # Backslash-escapes Markdown special characters and HTML-escapes the text.
  # URLs are swapped for random placeholders while escaping and restored
  # afterwards, so they are left exactly as they were.
  #
  # @param text [String] not modified
  # @return [String]
  def escape_special_characters(text)
    # Longest URLs first: otherwise a URL that is a prefix of another one
    # would be hoisted out of the longer URL and expose its tail to escaping.
    hoisted = extract_urls(text)
      .sort_by { |url| -url.length }
      .map { |url| [SecureRandom.hex, url] }
      .to_h

    escaped = hoisted.reduce(text) { |result, (placeholder, url)| result.gsub(url, placeholder) }
    escaped = escaped.gsub(/[\\`*_{}\[\]()#+\-.!~]/) { |c| "\\#{c}" }
    escaped = CGI.escapeHTML(escaped)

    # Block form keeps the URL from being interpreted as a replacement pattern.
    hoisted.reduce(escaped) { |result, (placeholder, url)| result.gsub(placeholder) { url } }
  end
end