-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcontent_extract.go
137 lines (118 loc) · 3.1 KB
/
content_extract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
package expatczscraper
import (
"errors"
"fmt"
"net/http"
"slices"
"strings"
"golang.org/x/net/html"
)
// BaseURL is the scheme and host of the expats.cz site. FindLinkWith prepends
// it to the href it finds on a page.
const BaseURL = "https://www.expats.cz"
// GetArticleContent downloads the page at link, parses it as HTML and returns
// the titled sections found by ExtractArticleContentWithTitle.
//
// It returns an error when the request fails, when the server answers with a
// status other than 200 OK, or when the body cannot be parsed as HTML. The
// underlying request and parse errors are wrapped with %w, so errors.Is and
// errors.As still see them.
func GetArticleContent(link string) ([]Section, error) {
	response, err := http.Get(link)
	if err != nil {
		// The error from http.Get already names the URL, so only add the step.
		return nil, fmt.Errorf("fetching article: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP request failed with status code %d", response.StatusCode)
	}

	htmlBody, err := html.Parse(response.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing article %q: %w", link, err)
	}
	return ExtractArticleContentWithTitle(htmlBody), nil
}
// FindLinkWith downloads pageLink, parses it as HTML and returns the URL of
// the first <a> element whose href contains linkContains.
//
// An href that is already absolute (it starts with "http://" or "https://")
// is returned unchanged; any other href is appended to BaseURL.
//
// It returns an error when the request fails, when the server answers with a
// status other than 200 OK, when the body cannot be parsed as HTML, or when
// no matching link exists on the page. Request and parse errors are wrapped
// with %w.
func FindLinkWith(linkContains string, pageLink string) (string, error) {
	response, err := http.Get(pageLink)
	if err != nil {
		// The error from http.Get already names the URL, so only add the step.
		return "", fmt.Errorf("fetching page: %w", err)
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("HTTP request failed with status code %d", response.StatusCode)
	}

	htmlBody, err := html.Parse(response.Body)
	if err != nil {
		return "", fmt.Errorf("parsing page %q: %w", pageLink, err)
	}

	l := FindFirstLinkWithContent(htmlBody, linkContains)
	if l == "" {
		return "", errors.New("no link found")
	}

	// Prefixing an absolute href with BaseURL would yield a malformed URL such
	// as "https://www.expats.czhttps://...", so pass it through untouched.
	if strings.HasPrefix(l, "http://") || strings.HasPrefix(l, "https://") {
		return l, nil
	}

	// Return a literal nil on success rather than reusing the err variable.
	return BaseURL + l, nil
}
// FindFirstLinkWithContent walks the tree rooted at n, visiting each node
// before its children and children in sibling order, and returns the value of
// the first href attribute on an <a> element that contains content as a
// substring. It returns the empty string when no such attribute is found.
func FindFirstLinkWithContent(n *html.Node, content string) string {
	if n.Type == html.ElementNode && n.Data == "a" {
		for i := range n.Attr {
			attr := &n.Attr[i]
			if attr.Key != "href" {
				continue
			}
			if strings.Contains(attr.Val, content) {
				return attr.Val
			}
		}
	}

	// Nothing usable on this node itself: search the subtree of each child.
	child := n.FirstChild
	for child != nil {
		found := FindFirstLinkWithContent(child, content)
		if found != "" {
			return found
		}
		child = child.NextSibling
	}
	return ""
}
// Section is one titled part of an article, as produced by
// ExtractArticleContentWithTitle.
type Section struct {
// Title is the text extracted from the section's heading element.
Title string
// Content is the text collected from the nodes that follow the heading.
Content string
}
// ExtractArticleContentWithTitle searches the tree rooted at n for the first
// <div> whose class matches containContentClass and which has at least one
// direct child <div> matching containTitleClass. For every such title child
// it emits a Section whose Title is the title's text and whose Content is
// gathered from the siblings that follow the title.
//
// A matching <div> without any title children is not final: the search
// continues into its descendants. The result is nil when nothing is found.
func ExtractArticleContentWithTitle(n *html.Node) []Section {
	isContentDiv := n.Type == html.ElementNode &&
		n.Data == "div" &&
		slices.ContainsFunc(n.Attr, containContentClass)

	if isContentDiv {
		var sections []Section
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			if child.Type != html.ElementNode || child.Data != "div" {
				continue
			}
			if !slices.ContainsFunc(child.Attr, containTitleClass) {
				continue
			}
			sections = append(sections, Section{
				Title:   extractText(child),
				Content: extractContent(child.NextSibling),
			})
		}
		if len(sections) > 0 {
			return sections
		}
	}

	// Either this is not a content container or it held no titles: descend.
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		sections := ExtractArticleContentWithTitle(child)
		if len(sections) > 0 {
			return sections
		}
	}
	return nil
}
// containTextWrapperClass reports whether a is a class attribute whose value
// contains the substring "widget text".
func containTextWrapperClass(a html.Attribute) bool {
	if a.Key != "class" {
		return false
	}
	return strings.Contains(a.Val, "widget text")
}
// containTitleClass reports whether a is a class attribute whose value
// contains the substring "headinglevel2".
func containTitleClass(a html.Attribute) bool {
	if a.Key != "class" {
		return false
	}
	return strings.Contains(a.Val, "headinglevel2")
}
// containContentClass reports whether a is a class attribute whose value
// contains the substring "content". The match is a plain substring test, so
// any class value that includes "content" anywhere qualifies.
func containContentClass(a html.Attribute) bool {
	if a.Key != "class" {
		return false
	}
	return strings.Contains(a.Val, "content")
}
// extractContent concatenates the text of a section body. Starting at n it
// walks forward through the sibling chain; for every <div> matching
// containTextWrapperClass it appends the text of that div's first child. The
// walk stops after the first <div> matching containTitleClass, which marks
// the start of the next section. A nil n yields the empty string.
//
// NOTE(review): only the first child of each text wrapper is read, as in the
// original code; confirm that the wrapper never carries text in later
// children.
func extractContent(n *html.Node) string {
	var b strings.Builder
	for c := n; c != nil; c = c.NextSibling {
		if c.Type != html.ElementNode || c.Data != "div" {
			continue
		}
		// An empty wrapper has no first child; skip it instead of handing a
		// nil node to extractText.
		if c.FirstChild != nil && slices.ContainsFunc(c.Attr, containTextWrapperClass) {
			b.WriteString(extractText(c.FirstChild))
		}
		if slices.ContainsFunc(c.Attr, containTitleClass) {
			break
		}
	}
	return b.String()
}
// extractText returns the concatenated data of every text node in the tree
// rooted at n, in document order. Only element nodes are descended into;
// nodes of any other type contribute nothing. A nil n yields the empty
// string.
func extractText(n *html.Node) string {
	var b strings.Builder
	appendNodeText(&b, n)
	return b.String()
}

// appendNodeText writes the text found under n into b, so that the whole
// traversal shares one buffer instead of concatenating strings per level.
func appendNodeText(b *strings.Builder, n *html.Node) {
	if n == nil {
		return
	}
	switch n.Type {
	case html.TextNode:
		b.WriteString(n.Data)
	case html.ElementNode:
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			appendNodeText(b, c)
		}
	}
}