Skip to content

Commit

Permalink
Merge branch 'mediapattern'
Browse files Browse the repository at this point in the history
  • Loading branch information
osvik committed Oct 20, 2017
2 parents 13dec08 + baa25df commit 4a96dd9
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ canonicals.csv
httpResponses.csv
redirects.csv
cssjspattern.csv
linkpattern.csv
linkpattern.csv
mediapattern.csv
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ The urls file, by default `urls.csv` must have all the urls you want to check. Y
To check all urls in `urls.csv` with all the checks use the command:

```
./check-my-pages -urls=urls.csv -http -analytics -canonical -redirects -linkpattern -cssjspattern
./check-my-pages -urls=urls.csv -http -analytics -canonical -redirects -linkpattern -cssjspattern -mediapattern
```

This repository includes a few testing urls in the file `urls.csv`. Please replace them by your own.
Expand All @@ -38,7 +38,8 @@ It will create a couple of files, one per check the script is doing:
* `canonicals.csv` - Reports the **canonical url** for every url
* `redirects.csv` - Reports the requested URL and the final URL. This will be useful to test the **redirects** in the main site.
* `linkpattern.csv` - Reports on links that include a regular expression pattern. Useful to track **links** to specific **dead sites**. The default pattern can be set by the `-pattern` option.
* `cssjspattern.csv` - Reports css and js urls that include a regular expression pattern. To detect dead css and js urls in large sites. The pattern can also be ddefined with the option `-pattern` (described bellow)
* `cssjspattern.csv` - Reports **css and js** urls that include a regular expression pattern. Useful to detect dead css and js urls in large sites. The pattern can also be defined with the option `-pattern` (described below)
* `mediapattern.csv` - Reports **media** urls (images, videos, audio, iframes and objects) that include a regular expression pattern. The pattern can also be defined with the option `-pattern`.

## Optional command line configurations

Expand Down
71 changes: 71 additions & 0 deletions check-my-pages.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ func main() {
isCanonical := flag.Bool("canonical", false, "Canonical URLS in the ")
isLinkpattern := flag.Bool("linkpattern", false, "Link Pattern")
isCSSJsPattern := flag.Bool("cssjspattern", false, "CSS and JS Pattern")
isMediaPattern := flag.Bool("mediapattern", false, "Image, object and iframe Pattern")
pattern := flag.String("pattern", `https?://(\w|-)+.greenpeace.org/espana/.+`, "Regular expression to detect in the links")
waitMiliseconds := flag.Int("miliseconds", 100, "Miliseconds between requests")
isClear := flag.Bool("clear", false, "Remove files created by this script")
Expand Down Expand Up @@ -137,6 +138,75 @@ func main() {
})
}

if *isMediaPattern {

	// Media report: one row per matching element, written as
	// "<page URL>,<element kind>,<media URL>". The file is opened in append
	// mode, so rows already present in mediapattern.csv are kept.
	mediaFile, openErr := os.OpenFile("mediapattern.csv", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600)
	if openErr != nil {
		panic(openErr)
	}
	defer mediaFile.Close()

	// reportMatch appends a row for element e when mediaURL matches linkRegex
	// (defined outside this block; presumably compiled from -pattern — verify).
	// URLs that do not match, including empty ones unless the pattern accepts
	// them, are skipped. A failed write panics, like the other checks do.
	reportMatch := func(e *colly.HTMLElement, kind, mediaURL string) {
		if !linkRegex.MatchString(mediaURL) {
			return
		}
		row := fmt.Sprintf("%s,%s,%s\n", e.Request.URL.String(), kind, mediaURL)
		if _, err := mediaFile.WriteString(row); err != nil {
			panic(err)
		}
	}

	// Images: use src, falling back to srcset when src is empty.
	// NOTE(review): a srcset value may list several comma-separated
	// candidates; it is written as-is, which can add extra commas to the
	// row — confirm whether that is acceptable for consumers of the CSV.
	c.OnHTML("img, picture source", func(e *colly.HTMLElement) {
		mediaURL := e.Attr("src")
		if mediaURL == "" {
			mediaURL = e.Attr("srcset")
		}
		reportMatch(e, "img", mediaURL)
	})

	// The remaining element kinds differ only in selector, label and the
	// attribute that carries the URL (<object> uses data instead of src).
	// Registration order is the same as before: video, audio, iframe, object.
	for _, m := range []struct{ selector, kind, attr string }{
		{"video, video source", "video", "src"},
		{"audio, audio source", "audio", "src"},
		{"iframe", "iframe", "src"},
		{"object", "object", "data"},
	} {
		m := m // per-iteration copy so each callback keeps its own entry (needed before Go 1.22)
		c.OnHTML(m.selector, func(e *colly.HTMLElement) {
			reportMatch(e, m.kind, e.Attr(m.attr))
		})
	}

}

if *isRedirects == true {

redirects, redirectsErr := os.OpenFile("redirects.csv", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0600)
Expand Down Expand Up @@ -168,6 +238,7 @@ func main() {
os.Remove("redirects.csv")
os.Remove("linkpattern.csv")
os.Remove("cssjspattern.csv")
os.Remove("mediapattern.csv")
os.Exit(0)
}

Expand Down

0 comments on commit 4a96dd9

Please sign in to comment.