-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess-md.pl
executable file
·146 lines (123 loc) · 4.46 KB
/
preprocess-md.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/perl
use strict;
use warnings;
# This tells Perl that this Perl source file is encoded in UTF-8.
use utf8;
# This tells Perl to open STDIN, etc. as UTF-8. This is better than doing
# binmode STDIN, ':utf8' because it works even when STDIN is a file specified
# in @ARGV auto-open by the <> operator.
use open qw(:std :utf8);
use Perl6::Slurp;
my $newp = 1;
my $class;
my $long;
# Setting this to 1 will remove all non-normative material from the output.
my $only_norm = 0;
# This preprocessor now makes two passes over the input stream in order
# to handle forward references. @lines contains the list of lines,
# complete with \n terminators, between passes.
my @lines;
my @sect; # The current (sub)section numbers(s) while parsing.
my %labels; # Map of label name to section number.
my $bad_nesting; # Have we had any badly nested headings yet?
sub text($$) {
my ($start, $txt) = @_;
# Ignore preformatted lines
goto done_text if $txt =~ /^ {4}/;
# RFC 2119 keywords
my @rfc2119 = ('must not', 'must', 'required', 'shall not', 'shall',
'should not', 'should', 'not recommended', 'recommended',
'may', 'optional');
my $css = 'font-variant: small-caps';
# Handle uses of *must*, etc.
my $rfc2119lc = join '|', map {s/ /\\ /; $_} @rfc2119;
$txt =~ s/(\*{1,2})($rfc2119lc)\1/
my $open = length($1) == 2 ? '**' : '';
$open . "<span style=\"$css\">$2<\/span>" . $open
/gex;
# Handle uses of MUST, etc.
my $rfc2119uc = join '|', map {s/ /\ /; uc $_} @rfc2119;
$txt =~ s/(?<!`)\b($rfc2119uc)\b/
"<span style=\"$css\">".lc($1)."<\/span>"
/gex;
# Scan all headings (currently excluding ones with --- and === underlining)
# to keep track of section numbers.
if ($txt =~ /^(#+)\s/) {
my $l = length $1;
while ($l < scalar @sect) { pop @sect }
if ($l == scalar @sect) { ++$sect[$l-1] }
elsif ($l == 1 + scalar(@sect)) { push @sect, 1 }
else { $bad_nesting = 1 }
# Where there's a {#name} mark at the heading line (which pandoc will
# use to create an anchor), store it as a label for {§name} references
# handled in this preprocessor.
if ($txt =~ /{#(\S+)}\s*$/) {
if (exists $labels{$1}) { die "Duplicate label: $1" }
$labels{$1} = join('.', @sect[1..$#sect]);
}
}
# You can't nest the short form of [...] and `...` cleanly in pandoc
# markdown, so preprocess it here.
$txt =~ s/`\[([:\w#]+)\]`/[`$1`](#$1)/g;
# Markdown has a poorly documented "feature" whereby two spaces at
# the end of a line inserts a hard line break (<br/> or \\). Stop that.
$txt =~ s/\s{2,}$/ /;
done_text:
push @lines, "$start$txt\n";
}
while (<>) {
chomp;
# File inclusion: {#include filename}
if ($newp and s/^{#include\s*(.*?)}$//) {
my $txt = slurp($1) or die "Unable to read file '$1'";
$txt =~ s/^/ /gm;
push @lines, "$txt\n";
}
# Paragraph classes: {.class} and {.class ...} {/}
if ($newp and s/^{\.([a-z]+)(\s*\.{3})?}\s*//) {
$class = $1;
$long = $2 ? " long" : "";
unless ($only_norm) {
my $start = "<div class=\"fhiso-$1$long\">\\fhisoopenclass{$1}";
if (/\S/) { push @lines, $start; }
else { # Look ahead
$_ = <>; chomp;
if ($_ =~ /^ {4}/) { push @lines, "$start\n\n"; text "", $_ }
else { text $start, $_ }
$newp = /^\s*$/;
next;
}
}
}
if (defined $class and $long and /^(.*)\{\/\}\s*$/) {
unless ($only_norm) {
text "", $1;
push @lines, "\\fhisocloseclass{$class}</div>\n\n";
}
$class = undef; $newp = 1;
}
elsif (defined $class and not $long and /^\s*$/) {
unless ($only_norm) {
push @lines, "\\fhisocloseclass{$class}</div>\n\n";
}
$class = undef; $newp = 1;
}
else {
unless ($only_norm and defined $class) {
text "", $_;
}
$newp = /^\s*$/;
}
}
unless ($only_norm) {
push @lines, "\\fhisocloseclass{$class}</div>\n" if defined $class;
}
# Second pass over data handling references of the form {§name}
foreach my $line (@lines) {
$line =~ s/{§(\S+)}/
die "File has bad nesting" if $bad_nesting;
die "Unknown label '$1'" unless exists $labels{$1};
'§'.$labels{$1}
/gex;
print $line;
}