tweet_downloader.py
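# Python 2 script: scrape tweets from Twitter's public web search/profile pages
# instead of the official API, writing one JSON object per line to the output file.
# Requires the twitterwebsearch submodule (see the sys.path hack below).
#
# Example invocations (the profile name and output filename here are only placeholders):
#   python tweet_downloader.py --search "\"london news\" since:2012-04-01 until:2012-07-31" --output tweets.json
#   python tweet_downloader.py --profile some_profile --output tweets.json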
import os
import sys
import time
import json
import requests
import argparse
import urllib
# Ugly hack to use twitterwebsearch as a git submodule
TWS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'twitterwebsearch')
sys.path.append(TWS_DIR)
from twitterwebsearch.parser import parse_search_results

TWITTER_PROFILE_URL = 'https://twitter.com/{term}'
TWITTER_PROFILE_MORE_URL = 'https://twitter.com/i/profiles/show/{term}/timeline?include_available_features=1&include_entities=1&max_position={max_position}'
TWITTER_SEARCH_URL = 'https://twitter.com/search?q={term}&src=typd'
TWITTER_SEARCH_MORE_URL = 'https://twitter.com/i/search/timeline?q={term}&src=typd&vertical=default&include_available_features=1&include_entities=1&max_position={max_position}'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36 Edge/12.0'


def find_value(html, key):
    # Extract the value of an attribute like key="value" from raw HTML:
    # skip past the key, the '=' and the opening quote, then read up to the closing quote.
    pos_begin = html.find(key) + len(key) + 2
    pos_end = html.find('"', pos_begin)
    return html[pos_begin:pos_end]


def download_tweets(search=None, profile=None, sleep=1):
    """Yield tweets for a search query or a profile by scraping Twitter's web timeline."""
    assert search or profile
    term = urllib.quote_plus(search or profile)
    url = TWITTER_SEARCH_URL if search else TWITTER_PROFILE_URL
    url_more = TWITTER_SEARCH_MORE_URL if search else TWITTER_PROFILE_MORE_URL

    # First page is plain HTML with the pagination cursors embedded as data attributes.
    response = requests.get(url.format(term=term), headers={'User-agent': USER_AGENT}).text
    max_position = find_value(response, 'data-max-position')
    min_position = find_value(response, 'data-min-position')
    for tweet in parse_search_results(response.encode('utf8')):
        yield tweet

    # Subsequent pages come from the "more" endpoint as JSON wrapping an HTML fragment.
    has_more_items = True
    while has_more_items:
        response = requests.get(url_more.format(term=term, max_position=min_position), headers={'User-agent': USER_AGENT}).text
        response_dict = json.loads(response)
        min_position = response_dict['min_position']
        has_more_items = response_dict['has_more_items'] if profile else False
        for tweet in parse_search_results(response_dict['items_html'].encode('utf8')):
            yield tweet
        if search:
            # For searches, keep paginating regardless of what the response reports.
            has_more_items = True
        time.sleep(sleep)


def main(argv):
    parser = argparse.ArgumentParser(add_help=False, description='Download tweets without using the Twitter API')
    parser.add_argument('--help', '-h', action='help', default=argparse.SUPPRESS, help='Show this help message and exit')
    parser.add_argument('--search', '-s', type=str, help='Search query (e.g., "\\"london news\\" since:2012-04-01 until:2012-07-31")')
    parser.add_argument('--profile', '-p', help='Profile query')
    parser.add_argument('--output', '-o', help='Output filename')

    try:
        args = parser.parse_args(argv)
        search = args.search
        profile = args.profile
        output = args.output
        if bool(search) == bool(profile):
            parser.print_usage()
            raise ValueError('you need to use either the search or the profile option')
        if not output:
            parser.print_usage()
            raise ValueError('you need to specify an output filename')

        print 'Downloading', (search or profile)
        count = 0
        with open(output, 'wb') as fp:
            for tweet in download_tweets(search=search, profile=profile):
                print >> fp, json.dumps(tweet)
                count += 1
                sys.stdout.write('Fetched %d tweet(s)\r' % count)
                sys.stdout.flush()
        print '\nDone!'
    except Exception, e:
        print 'Error:', str(e)
        sys.exit(1)


if __name__ == "__main__":
    main(sys.argv[1:])