Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bloomfilter support base on the redis backend #8

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 120 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### Go template
# Binaries for programs and plugins
*.exe
*.exe~
*.dll
*.so
*.dylib

# Test binary, built with `go test -c`
*.test

# Output of the go coverage tool, specifically when used with LiteIDE
*.out

# Dependency directories (remove the comment below to include it)
# vendor/

### macOS template
# General
.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

.idea/*

39 changes: 34 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@ This is a redis based storage backend for [Colly](https://github.com/gocolly/col

[![GoDoc](https://godoc.org/github.com/gocolly/redisstorage?status.svg)](https://godoc.org/github.com/gocolly/redisstorage)

## Install

## Redis Storage

### Install

```
go get -u github.com/gocolly/redisstorage
```


## Usage
### Usage

```go
import (
"github.com/gocolly/colly"
"github.com/gocolly/colly/v2"
"github.com/gocolly/redisstorage"
)
```
Expand All @@ -31,12 +34,38 @@ storage := &redisstorage.Storage{
Prefix: "job01",
}

err := c.SetStorage(storage)
if err != nil {
if err := c.SetStorage(storage); err != nil {
panic(err)
}
```

## RedisBloomFilterStorage

```go
import (
"github.com/gocolly/colly/v2"
"github.com/gocolly/redisstorage"
)
```


```go
c := colly.NewCollector()

storage := &redisstorage.RedisBloomFilterStorage{
Storage: &redisstorage.Storage{
Address: "127.0.0.1:6379",
Password: "",
DB: 0,
Prefix: "job01",
},
}

if err := c.SetStorage(storage); err != nil {
panic(err)
}

```

## Bugs

Expand Down
48 changes: 48 additions & 0 deletions _examples/redis_bloomfilter_storage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package main

import (
"fmt"

"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"github.com/gocolly/redisstorage"
)

// main is a runnable example that wires a RedisBloomFilterStorage into a colly
// collector and a colly queue, enqueues 10<<10 httpbin URLs and crawls them
// with 10 queue threads, printing every response body.
//
// Every setup or crawl error aborts the example with a panic so that a
// misconfigured Redis instance is reported instead of being silently ignored.
func main() {
	c := colly.NewCollector()
	//c.AllowURLRevisit = true
	redisbloomfilterstorage := &redisstorage.RedisBloomFilterStorage{
		Storage: &redisstorage.Storage{
			Address:  "127.0.0.1:6379",
			Password: "",
			DB:       0,
			Prefix:   "bl",
		},
	}
	if err := redisbloomfilterstorage.Init(); err != nil {
		panic(err)
	}

	// The original compared err with itself (err != err), which is always
	// false and therefore hid every SetStorage failure.
	if err := c.SetStorage(redisbloomfilterstorage); err != nil {
		panic(err)
	}
	// close redis client
	defer redisbloomfilterstorage.Client.Close()

	// Deferred calls run in LIFO order, so Clear executes before the client
	// is closed above.
	defer redisbloomfilterstorage.Clear()

	q, err := queue.New(10, redisbloomfilterstorage)
	if err != nil {
		panic(err)
	}

	c.OnResponse(func(response *colly.Response) {
		fmt.Println(string(response.Body))
	})

	for i := 0; i < 10<<10; i++ {
		if err := q.AddURL(fmt.Sprintf("%s?x=%v", `https://httpbin.org/delay/1`, i)); err != nil {
			panic(err)
		}
	}

	if err := q.Run(c); err != nil {
		panic(err)
	}
}
41 changes: 41 additions & 0 deletions _examples/redis_storage.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package main

import (
"fmt"

"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
"github.com/gocolly/redisstorage"
)

// main is a runnable example that uses the plain Redis Storage as both the
// collector storage and the queue storage, enqueues 10 httpbin URLs and
// crawls them with 10 queue threads, printing every response body.
//
// Every setup or crawl error aborts the example with a panic so that a
// misconfigured Redis instance is reported instead of being silently ignored.
func main() {
	c := colly.NewCollector()

	storage := &redisstorage.Storage{
		Address:  "127.0.0.1:6379",
		Password: "",
		DB:       0,
		Prefix:   "job01",
	}

	// The original compared err with itself (err != err), which is always
	// false and therefore hid every SetStorage failure.
	if err := c.SetStorage(storage); err != nil {
		panic(err)
	}
	// close redis client
	//defer storage.Client.Close()

	q, err := queue.New(10, storage)
	if err != nil {
		panic(err)
	}

	c.OnResponse(func(response *colly.Response) {
		fmt.Println(string(response.Body))
	})

	for i := 0; i < 10; i++ {
		if err := q.AddURL(fmt.Sprintf("%s?x=%v", `https://httpbin.org/delay/1`, i)); err != nil {
			panic(err)
		}
	}

	if err := q.Run(c); err != nil {
		panic(err)
	}
}
Loading