-
-
Notifications
You must be signed in to change notification settings - Fork 75
/
Copy pathsaxy_html.rb
executable file
·113 lines (93 loc) · 3.28 KB
/
saxy_html.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env ruby
# This example demonstrates the use of the Ox.sax_html parser and the
# Ox::Builder. The parser is used to parse an HTML file and add a
# `class="ppp"` to each '<p>' element start.
#
# The approach taken is to build while parsing. An HTML parse is started and a
# builder call is made on each parser callback. If the element is a 'p' then
# the class attribute is added. All others remain the same.
# Use the current repo if run from the examples directory.
ox_dir = File.dirname(File.dirname(File.expand_path(__FILE__)))
$LOAD_PATH << File.join(ox_dir, 'ext')
$LOAD_PATH << File.join(ox_dir, 'lib')
require 'ox'
# Handler for the SAX callbacks. Each instance owns an Ox::Builder and feeds
# it from the parser callbacks, so the output document is built while parsing
# takes place. Every <p> element gets the class "ppp" added; all other
# elements are passed through unchanged.
class Saxy < Ox::Sax
  # HTML void elements. These are written with Ox::Builder#void_element and
  # are never popped from the builder in #end_element.
  VOID_ELEMENTS = %i[
    area base br col embed hr img input link meta param source track wbr
  ].freeze

  # Class name added to every <p> element.
  P_CLASS = 'ppp'

  def initialize
    super
    # The builder is created with an indentation of 2 but that can be changed
    # to the desired indentation.
    @builder = Ox::Builder.new(indent: 2)
    # @element_name and @attrs hold an element start that has been seen but
    # not yet written to the builder.
    @element_name = nil
    @attrs = {}
  end

  # Returns the builder's current output as a String.
  def to_s
    @builder.to_s
  end

  # Writes the pending element start, if any, to the builder.
  #
  # The builder creates an element start together with its attributes, but
  # the parser reports the element start and each attribute in separate
  # callbacks. The start is therefore deferred: #start_element records the
  # name, #attr collects the attributes, and every other callback calls this
  # method first so the pending start is written before anything else.
  def push_element
    return if @element_name.nil?

    # Add the class attribute if the element is a <p> element.
    add_p_class if @element_name == :p

    # Void elements are self closing, so they are written with a dedicated
    # builder call instead of being opened like a regular element.
    if VOID_ELEMENTS.include?(@element_name)
      @builder.void_element(@element_name, @attrs)
    else
      @builder.element(@element_name, @attrs)
    end

    # Reset the pending element start.
    @element_name = nil
    @attrs = {}
  end

  # Parser callback for an element start. Any previously pending start is
  # written first, then +name+ becomes the new pending start.
  # NOTE(review): +name+ is compared against Symbols (:p, VOID_ELEMENTS), so
  # this presumably relies on the parser passing symbolized names - confirm.
  def start_element(name)
    push_element
    @element_name = name
  end

  # Parser callback for an attribute. The attribute is stored until the
  # pending element start is written.
  def attr(name, value)
    @attrs[name] = value
  end

  # Parser callback for a doctype; forwarded to the builder.
  def doctype(value)
    push_element
    @builder.doctype(value)
  end

  # Parser callback for a comment; forwarded to the builder.
  def comment(value)
    push_element
    @builder.comment(value)
  end

  # Parser callback for text; forwarded to the builder.
  def text(value)
    push_element
    @builder.text(value)
  end

  # Parser callback for an element end. Void elements were written self
  # closed, so only non-void elements are popped from the builder.
  def end_element(name)
    push_element
    @builder.pop unless VOID_ELEMENTS.include?(name)
  end

  # Just in case there is a parse error this will display the error along
  # with where the error occurred in the input.
  def error(message, line, column)
    puts "*-*-* error at #{line}:#{column}: #{message}"
  end

  private

  # Adds P_CLASS to the pending :class attribute. Classes already present on
  # the element are kept rather than overwritten, and P_CLASS is not added a
  # second time when the element already carries it.
  def add_p_class
    existing = @attrs[:class].to_s
    return if existing.split.include?(P_CLASS)

    @attrs[:class] = existing.strip.empty? ? P_CLASS : "#{existing} #{P_CLASS}"
  end
end
# Create the handler, then load the HTML file into a String. Ox.sax_html also
# handles IO objects.
handler = Saxy.new
html = File.read('saxy.html')
Ox.sax_html(handler, html)

# For debugging uncomment these lines.
# puts "******************** original *************************\n#{html}"
# puts "******************** modified *************************\n#{handler.to_s}"

# For benchmarks these lines should be repeated to parse and to generate a
# modified HTML string.
# handler = Saxy.new
# Ox.sax_html(handler, html)
# handler.to_s