forked from uf0/youtube-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetComments.py
92 lines (69 loc) · 3.08 KB
/
getComments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import requests,csv,argparse,sys,re
from config import *
def get_reply(activityID, apiKey, writer, video):
url = 'https://www.googleapis.com/plus/v1/activities/' + activityID + '/comments'
params = {}
params['key'] = apiKey
params['maxResults'] = 500
r = requests.get(url, params = params)
results = r.json()['items']
for comment in results:
row = []
published = comment['published']
title = ''
content = comment['object']['content'].encode('utf8') #TODO remove html content
author = comment['actor']['displayName'].encode('utf8')
reply = 0
row.extend([video, published, title, content, author, reply])
writer.writerow(row)
def get_comments(url, query_params, writer, video):
r = requests.get(url, params = query_params)
result = r.json()['feed']
comments = result['entry']
for comment in comments:
row = []
published = comment['published']['$t']
title = comment['title']['$t'].replace(u'\ufeff', '').strip()
title = title.encode('utf8')
title = re.sub(r'\n', '', title)
content = comment['content']['$t'].replace(u'\ufeff', '').strip()
content = content.encode('utf8')
content = re.sub(r'\n', '', content)
author = comment['author'][0]['name']['$t']
reply = comment['yt$replyCount']['$t']
row.extend([video, published, title, content, author.encode('utf8'), reply])
writer.writerow(row)
if 'yt$googlePlusUserId' in comment and reply > 0:
activityID = comment['id']['$t'].split('/')[-1]
get_reply(activityID, GOOGLE_API, writer, video)
if(result['link'][-1]['rel'] == 'next'):
next = result['link'][-1]['href']
get_comments(next, '', writer, video)
def main():
parser = argparse.ArgumentParser(description='getComments. Get all comments from Youtube videos')
parser.add_argument("-i", "--input", dest="input", help="the path to your .tsv file with videos ID", required=True)
parser.add_argument("-o", "--output", dest="output", default="output.tsv", help="the output file (.tsv file)")
options = parser.parse_args()
data = csv.DictReader(open(options.input, 'rb'), delimiter='\t', skipinitialspace=True)
site_root = 'https://gdata.youtube.com/feeds/api/videos/'
site_api = '/comments'
query_params = {}
query_params['orderby'] = 'published'
query_params['alt'] = 'json'
query_params['max-results'] = 50
data = list(data)
total = len(data)
writer = csv.writer(open(options.output, 'wb'), delimiter='\t', quotechar='"')
headers = ['id', 'published', 'title', 'content', 'author', 'reply']
writer.writerow(headers)
for i, line in enumerate(data):
video = line['id']
sys.stdout.write('\x1b[2K\r[' + str(i+1) + '/' + str(total) +'] ' + video)
sys.stdout.flush()
site_url = site_root + video + site_api
get_comments(site_url, query_params, writer, video)
sys.stdout.write('\x1b[2K\r')
sys.stdout.flush()
sys.exit()
if __name__ == '__main__':
main()