video.py
import requests, json, csv, os, shutil
import re  # used below for matching YouTube URLs and story paths
from lxml import etree
from bs4 import BeautifulSoup
from datetime import datetime
from pytz import timezone
from scripts import *
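# getStories, getDatetime, createFolders, getImage and storyCSV are used below
# but not defined in this file; they are assumed to come in through the
# wildcard import from the local scripts module above.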
cwd = os.getcwd()
pacific = timezone('America/Los_Angeles')
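# Query settings: the video subcategory IDs to request, a high item cap so the
# pull is effectively unbounded, and the Pacific-time publish-date window that
# exported stories must fall inside.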
videoSubcats = "31994433,32058759,32058196,32003307,32058816,32042463,32042459,32058748,31994425,32058825,32042583,32042464,32058820,32058774,32042460,32003311,32058826,32058162"
items = 9999
startDate = pacific.localize(datetime(2018,1,1,0,0,1))
endDate = pacific.localize(datetime(2018,3,31,11,59,59))

def writeVideoXML(stories):
    """Write one XML file (plus thumbnail) per YouTube-hosted story in the date window."""
    for story in stories:
        # Pull the YouTube video ID out of the story's video URL
        vid = story['video']
        vidTest = False
        if re.search(r'youtube\.com', vid):
            # e.g. https://www.youtube.com/watch?v=VIDEOID
            vid = vid.split('=')[1]
            vidTest = True
        elif re.search(r'youtu\.be', vid):
            # e.g. https://youtu.be/VIDEOID
            vid = vid.split('/')[3]
            vidTest = True
        if vidTest:
            # Only export stories published inside the configured date window
            dt = getDatetime(story['published'])
            if startDate <= dt <= endDate:
                print("Getting {0}: {1} | {2}".format(story['headline'], story['path'], vid))
                article = etree.Element('article')
                # Article story metadata
                uniqueid = etree.SubElement(article, 'uniqueid')
                uniqueid.text = story['id']
                title = etree.SubElement(article, 'title')
                title.text = etree.CDATA(story['headline'])
                byline = etree.SubElement(article, 'byline')
                byline.text = etree.CDATA(story['byline'])
                date = etree.SubElement(article, 'pubdate')
                date.text = etree.CDATA(dt.isoformat())
                # Create the dated folder structure for this story
                dtDIR = dt.strftime('%Y/%m/%d')
                filePath = '{0}/video/{1}/'.format(cwd, dtDIR)
                createFolders(filePath)
                # Back to building the XML
                category = etree.SubElement(article, 'category')
                category.text = "VIDEO"
                taxonomies = etree.SubElement(article, 'taxonomies')
                taxonomy = etree.SubElement(taxonomies, 'taxonomy')
                taxonomy.text = "630"
                images = etree.SubElement(article, 'images')
                image = etree.SubElement(images, 'image')
                title = etree.SubElement(image, 'title')
                title.text = ""
                caption = etree.SubElement(image, 'caption')
                caption.text = ""
                credit = etree.SubElement(image, 'credit')
                credit.text = ""
                filename = etree.SubElement(image, 'filename')
                filename.text = "{0}.jpg".format(vid)
                # Grab the highest-resolution YouTube thumbnail for the video
                vidPic = "https://img.youtube.com/vi/{0}/maxresdefault.jpg".format(vid)
                getImage(vidPic, filePath, vid)
                seo = etree.SubElement(article, 'seo-label')
                # seoRegex = r'http://registerguard.com(\/.*\.html.csp)'
                # seo.text = re.search(seoRegex,story['path'])[1]
                seoRegex = r'http://registerguard\.com(/.*\.csp)'
                try:
                    seo.text = re.search(seoRegex, story['path'])[1]
                except TypeError:
                    # re.search returns None when the story path doesn't match
                    print("--- ERROR ERROR --- {0}".format(story['path']))
                video = etree.SubElement(article, 'video-id')
                video.text = vid
                # Export to file
                # print(etree.tostring(article, pretty_print=True))
                out = etree.ElementTree(article)
                outFILE = '{0}/{1}-{2}.xml'.format(filePath, dt.strftime('%Y%m%d'), story['id'])
                out.write(outFILE, pretty_print=True, xml_declaration=True, encoding='utf-8')
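
# For reference, each exported file ends up shaped roughly like this
# (an illustrative sketch of the elements built above; actual values vary):
#
#   <?xml version='1.0' encoding='utf-8'?>
#   <article>
#     <uniqueid>12345</uniqueid>
#     <title><![CDATA[Story headline]]></title>
#     <byline><![CDATA[Reporter name]]></byline>
#     <pubdate><![CDATA[2018-01-15T08:30:00-08:00]]></pubdate>
#     <category>VIDEO</category>
#     <taxonomies><taxonomy>630</taxonomy></taxonomies>
#     <images>
#       <image>
#         <title></title>
#         <caption></caption>
#         <credit></credit>
#         <filename>VIDEOID.jpg</filename>
#       </image>
#     </images>
#     <seo-label>/some-story-path.csp</seo-label>
#     <video-id>VIDEOID</video-id>
#   </article>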

def main():
    stories = getStories(videoSubcats, items)
    writeVideoXML(stories)
    # Write those stories to a CSV (stories var, name of output csv)
    #storyCSV(stories,'video2.csv')

if __name__ == '__main__':
    main()
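
# Usage sketch (assuming the scripts helpers and network access are available):
#   python video.py
# Each matching story produces an XML file named YYYYMMDD-<story id>.xml, plus a
# YouTube thumbnail fetched via getImage, under ./video/YYYY/MM/DD/ relative to
# the directory the script is run from.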