-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtwitter_url_extractor.rb
executable file
·87 lines (79 loc) · 2.21 KB
/
twitter_url_extractor.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env ruby
# encoding: UTF-8
# Name:
# twitter_url_extractor.rb
#
# Purpose:
# This is a small tool for extracting HTTP URIs from a search of Twitter
# hashtags. Given a hashtag, it returns a list of unique links found within
# the 100 most-recent matching tweets (if any).
#
# Usage:
# ruby twitter_url_extractor.rb <hashtag>
#
# Options:
# none
#
# Exit Status Codes:
# 0 = Success
# 64 = Command line usage error
#
# Copyright:
# Copyright 2012 Todd A. Jacobs
# All Rights Reserved
#
# License:
# Released under the GNU General Public License (GPL)
# http://www.gnu.org/copyleft/gpl.html
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
require 'bundler/setup'
require 'twitter'
# Number of results requested from Twitter; handed to Twitter.search as the
# `rpp` option by the search method.
RECORD_LIMIT = 100
# Liberal, case-insensitive (`i`) pattern for spotting URLs in free text,
# written in extended (`x`) mode so the whitespace and line breaks inside the
# literal are ignored by the regex engine.
#
# Structure, top to bottom:
# * `\b` then the outer capture group (group 1), which wraps the whole URL.
# * A start marker, one of: a `scheme:` followed by one to three slashes or
#   an alphanumeric/percent character; a `www.`-style prefix with up to
#   three digits; or a dotted host name ending in a 2-4 letter label and `/`.
# * One or more runs of non-space, non-bracket characters or parenthesised
#   segments (parentheses may nest one level deep).
# * A final element that is either a parenthesised segment or a single
#   character that is not whitespace or common trailing punctuation, so
#   sentence punctuation after a URL is not swallowed.
#
# The parenthesised-segment alternatives contain additional capture groups
# (groups 2-5). String#scan without a block therefore returns an array of
# captures per match, not a plain string.
#
# NOTE(review): this appears to be derived from John Gruber's "liberal URL
# regex" -- confirm the provenance before modifying it.
URL_PATTERN = %r{
\b
(
(?: [a-z][\w-]+:
(?: /{1,3} | [a-z0-9%] ) |
www\d{0,3}[.] |
[a-z0-9.\-]+[.][a-z]{2,4}/
)
(?:
[^\s()<>]+ | \(([^\s()<>]+|(\([^\s()<>]+\)))*\)
)+
(?:
\(([^\s()<>]+|(\([^\s()<>]+\)))*\) |
[^\s`!()\[\]{};:'".,<>?«»“”‘’]
)
)
}ix
# Print a one-line usage summary (naming this script by its base file name)
# via Kernel#warn, then terminate the process with exit status 64, the
# "command line usage error" code listed in the file header.
def usage
  script_name = File.basename($PROGRAM_NAME)
  warn "Usage: ruby #{script_name} <hashtag>"
  exit 64
end
# Ensure that the hashtag has a leading hash symbol. This makes the leading
# '#' optional, which avoids the need to quote or escape it on the command
# line.
#
# hashtag - the String given by the user, with or without a leading '#'.
#
# Returns the String unchanged when it already begins with '#'; otherwise
# returns a new String with '#' prepended.
def format_hashtag(hashtag)
  # Only the very first character counts. A regex anchored with `^` would
  # also accept a '#' at the start of any later line of the string.
  hashtag.start_with?('#') ? hashtag : "##{hashtag}"
end
# Return the unique URLs found in the list of tweets, in order of first
# appearance (the list is not sorted).
#
# tweets - a collection responding to #map whose elements respond to #text.
#
# Returns an Array of Strings, each being a complete URL_PATTERN match.
def uniq_urls(tweets)
  tweets.map(&:text).each_with_object([]) do |text, urls|
    # Scan each tweet's raw text on its own. Scanning Array#to_s would match
    # against the *inspected* form, where a newline after a URL shows up as
    # the two characters `\n` and gets absorbed into the match.
    #
    # The block form collects only the whole match. URL_PATTERN has nested
    # capture groups, so a block-less scan would also return the fragments
    # inside parentheses (e.g. "bar" from ".../Foo_(bar)") as if they were
    # URLs.
    text.to_s.scan(URL_PATTERN) { urls << Regexp.last_match(0) }
  end.uniq
end
# Ask Twitter for tweets matching the given hashtag.
#
# hashtag - the search term, passed through to Twitter.search as-is.
#
# The request asks for 'recent' results and passes RECORD_LIMIT as `rpp`
# (presumably "results per page" -- confirm against the installed twitter
# gem version). Returns whatever Twitter.search returns.
def search(hashtag)
  Twitter.search(
    hashtag,
    rpp: RECORD_LIMIT,
    result_type: 'recent'
  )
end
# Script entry point: runs only when this file is executed directly, not
# when it is loaded by another program.
if $PROGRAM_NAME == __FILE__
  # A hashtag argument is mandatory; usage exits with a usage-error status.
  usage if ARGV.empty?

  tag = format_hashtag(ARGV.first)
  puts uniq_urls(search(tag))
end