This repository has been archived by the owner on Oct 18, 2022. It is now read-only.

flake8 fixes #13

Open · wants to merge 3 commits into master
8 changes: 8 additions & 0 deletions README.md
@@ -14,3 +14,11 @@ Facebook [provides API access](https://www.facebook.com/ads/library/api) to the
For some countries (as of writing, the US, UK and Brazil), a [public report](https://www.facebook.com/ads/library/report/?source=archive-landing-page&country=GB) is available. Such reports make ads explorable through a web interface and also offer a CSV database. Facebook opened its Library to all EU countries in mid-April, but the public report is still not available for most of them. This harvesting and re-exposure of the data is thus currently the only way for people without a Facebook account to access it. This work may become unnecessary once such reports are opened for all remaining countries.

However, the data exposed by the API also seems incomplete at present: the API reports only 2,900 political ads for Great Britain, while the [corresponding report](https://www.facebook.com/ads/library/report/?source=archive-landing-page&country=GB) lists 49,000. The scope of the data available through the API is undocumented, and tracking this data may improve the shared understanding of that scope.

### Check the code with flake8

To run flake8, use tox:

```sh
tox
```
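
The diff does not include the project's `tox.ini` itself; a minimal configuration along these lines would make the command above run flake8 (the env name and source paths here are assumptions for illustration, not the repository's actual file):

```ini
; Hypothetical tox.ini sketch: a single env that only runs flake8.
[tox]
envlist = flake8
skipsdist = true

[testenv:flake8]
deps = flake8
commands = flake8 facebook_fetch google_fetch snapchat_fetch
```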
20 changes: 11 additions & 9 deletions facebook_fetch/fb_login/login.py
@@ -59,13 +59,15 @@ def connect_facebook():
browser = mechanize.Browser()
cookies = http.cookiejar.LWPCookieJar()
browser.set_cookiejar(cookies)
browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36')]
browser.addheaders = [
('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36')]
browser.set_handle_equiv(True)
browser.set_handle_gzip(True)
browser.set_handle_redirect(True)
browser.set_handle_referer(True)
browser.set_handle_robots(False)
browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
browser.set_handle_refresh(
mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Login page
url = 'https://m.facebook.com/login.php'
@@ -90,16 +92,16 @@ def connect_facebook():


def get_user_token(browser):
#print(response.read())
#print(cookies)
# print(response.read())
# print(cookies)

#import sys, logging
#logger = logging.getLogger("mechanize")
#logger.addHandler(logging.StreamHandler(sys.stdout))
#logger.setLevel(logging.DEBUG)
#browser.set_debug_http(True)
#browser.set_debug_responses(True)
#browser.set_debug_redirects(True)
# logger.addHandler(logging.StreamHandler(sys.stdout))
# logger.setLevel(logging.DEBUG)
# browser.set_debug_http(True)
# browser.set_debug_responses(True)
# browser.set_debug_redirects(True)

browser.set_handle_redirect(False)
params = urllib.parse.urlencode({
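Reviewer note: even after wrapping, the `addheaders` entry above still carries the full User-agent string on one over-long line. A sketch of a common alternative, assuming flake8's default 79-character limit is the target and reusing the `browser` object from `connect_facebook` (the constant name `USER_AGENT` is illustrative, not part of the PR):

```python
# Illustrative only: implicit string concatenation inside parentheses keeps
# every source line short; the pieces join back into the original UA string.
USER_AGENT = (
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Ubuntu Chromium/73.0.3683.86 Chrome/73.0.3683.86 Safari/537.36'
)

browser.addheaders = [('User-agent', USER_AGENT)]
```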
4 changes: 3 additions & 1 deletion facebook_fetch/fb_login/serve_token.py
@@ -5,6 +5,7 @@

application = Flask(__name__)


@application.route('/')
def hello():
shared_secret = request.args.get('shared_secret')
@@ -14,5 +15,6 @@ def hello():
token, _ = connect_and_get_user_token()
return token


if __name__ == '__main__':
application.run()
application.run()
82 changes: 45 additions & 37 deletions facebook_fetch/fetch.py
@@ -1,7 +1,5 @@
import json
import datetime
import os
import pathlib
import logging
import re
import time
@@ -29,39 +27,40 @@
'spend',
]
COUNTRIES = [
{'code': 'AT', 'page_size': 250}, # Austria
{'code': 'BE', 'page_size': 250}, # Belgium
{'code': 'BG', 'page_size': 250}, # Bulgaria
{'code': 'CY', 'page_size': 250}, # Cyprus
{'code': 'CZ', 'page_size': 250}, # Czechia
{'code': 'DE', 'page_size': 1000}, # Germany
{'code': 'DK', 'page_size': 250}, # Denmark
{'code': 'EE', 'page_size': 250}, # Estonia
{'code': 'ES', 'page_size': 250}, # Spain
{'code': 'FI', 'page_size': 250}, # Finland
{'code': 'FR', 'page_size': 250}, # France
{'code': 'GR', 'page_size': 250}, # Greece
{'code': 'HR', 'page_size': 250}, # Croatia
{'code': 'HU', 'page_size': 250}, # Hungary
{'code': 'IE', 'page_size': 250}, # Ireland
{'code': 'IT', 'page_size': 250}, # Italy
{'code': 'LT', 'page_size': 250}, # Lithuania
{'code': 'LU', 'page_size': 250}, # Luxembourg
{'code': 'LV', 'page_size': 250}, # Latvia
{'code': 'MT', 'page_size': 250}, # Malta
{'code': 'NL', 'page_size': 250}, # Netherlands
{'code': 'PL', 'page_size': 250}, # Poland
{'code': 'PT', 'page_size': 250}, # Portugal
{'code': 'RO', 'page_size': 250}, # Romania
{'code': 'SI', 'page_size': 250}, # Slovenia
{'code': 'SE', 'page_size': 250}, # Sweden
{'code': 'SK', 'page_size': 250}, # Slovakia
{'code': 'GB', 'page_size': 250}, # United Kingdom
# {'code': 'US', 'page_size': 2000}, # United States of America
{'code': 'AT', 'page_size': 250}, # Austria
{'code': 'BE', 'page_size': 250}, # Belgium
{'code': 'BG', 'page_size': 250}, # Bulgaria
{'code': 'CY', 'page_size': 250}, # Cyprus
{'code': 'CZ', 'page_size': 250}, # Czechia
{'code': 'DE', 'page_size': 1000}, # Germany
{'code': 'DK', 'page_size': 250}, # Denmark
{'code': 'EE', 'page_size': 250}, # Estonia
{'code': 'ES', 'page_size': 250}, # Spain
{'code': 'FI', 'page_size': 250}, # Finland
{'code': 'FR', 'page_size': 250}, # France
{'code': 'GR', 'page_size': 250}, # Greece
{'code': 'HR', 'page_size': 250}, # Croatia
{'code': 'HU', 'page_size': 250}, # Hungary
{'code': 'IE', 'page_size': 250}, # Ireland
{'code': 'IT', 'page_size': 250}, # Italy
{'code': 'LT', 'page_size': 250}, # Lithuania
{'code': 'LU', 'page_size': 250}, # Luxembourg
{'code': 'LV', 'page_size': 250}, # Latvia
{'code': 'MT', 'page_size': 250}, # Malta
{'code': 'NL', 'page_size': 250}, # Netherlands
{'code': 'PL', 'page_size': 250}, # Poland
{'code': 'PT', 'page_size': 250}, # Portugal
{'code': 'RO', 'page_size': 250}, # Romania
{'code': 'SI', 'page_size': 250}, # Slovenia
{'code': 'SE', 'page_size': 250}, # Sweden
{'code': 'SK', 'page_size': 250}, # Slovakia
{'code': 'GB', 'page_size': 250}, # United Kingdom
# {'code': 'US', 'page_size': 2000}, # United States of America
]


class FacebookToken():
_delay = datetime.timedelta(0, 30*60) # 30 minutes
_delay = datetime.timedelta(0, 30*60) # 30 minutes

def __init__(self):
self._timestamp = None
@@ -87,10 +86,14 @@ def token(self):
return self._token


AD_ID_REGEX = re.compile(r'^https://www\.facebook\.com/ads/archive/render_ad/\?id=(\d+)&access_token=[a-zA-Z0-9]+$')
AD_ID_REGEX = re.compile(
r'^https://www\.facebook\.com/ads/archive/render_ad/\?id=(\d+)&access_token=[a-zA-Z0-9]+$')


def get_ad_id(ad):
return AD_ID_REGEX.match(ad['ad_snapshot_url']).groups()[0]


def fetch(country_code, page_size, token):
def make_request(token_value, after=None):
ADS_API_URL = "https://graph.facebook.com/v3.3/ads_archive"
@@ -120,7 +123,8 @@ def make_request(token_value, after=None):
timeout=60,
)

assert response.status_code == 200, (response.status_code, response.text)
assert response.status_code == 200, (
response.status_code, response.text)

except Exception as exception:
logging.exception('Request failed')
@@ -184,7 +188,9 @@ def write_to_file(country_code, page_size, token):
print('Could not fetch ads for country {}'.format(country_code))
return

file_path = config.DATA_DIR / 'facebook/API' / country_code / 'facebook-ads-archive_{}_{}.json'.format(country_code, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
file_path = config.DATA_DIR / 'facebook/API' / country_code / \
'facebook-ads-archive_{}_{}.json'.format(
country_code, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

with open(file_path, 'w') as outfile:
json.dump(ads, outfile)
@@ -198,8 +204,10 @@ def create_dirs():
(config.DATA_DIR / 'facebook/API').mkdir(exist_ok=True)
(config.DATA_DIR / 'twitter').mkdir(exist_ok=True)
for country in COUNTRIES:
(config.DATA_DIR / 'facebook/API' / country['code']).mkdir(exist_ok=True)
(config.DATA_DIR / 'facebook/reports' / country['code']).mkdir(exist_ok=True)
(config.DATA_DIR / 'facebook/API' /
country['code']).mkdir(exist_ok=True)
(config.DATA_DIR / 'facebook/reports' /
country['code']).mkdir(exist_ok=True)


if __name__ == '__main__':
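Reviewer note on the `file_path` wrapping in `write_to_file` above: backslash continuations satisfy flake8 but are fragile under later edits. A sketch of an alternative that needs no continuation character, using the names already in scope in that function (the intermediate variable names are illustrative):

```python
# Illustrative only: build the timestamped filename first, then join it onto
# the base path with pathlib's / operator.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = 'facebook-ads-archive_{}_{}.json'.format(country_code, timestamp)
file_path = config.DATA_DIR / 'facebook/API' / country_code / filename
```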
1 change: 0 additions & 1 deletion facebook_fetch/fill_database.py
@@ -9,7 +9,6 @@


import json
import os

from pymongo import MongoClient

8 changes: 5 additions & 3 deletions facebook_fetch/graph.py
@@ -1,5 +1,4 @@

import pathlib
import datetime
import json

@@ -15,7 +14,8 @@ def build_graph():
for filename in sorted(list((config.DATA_DIR / 'facebook/API' / country_code).iterdir())):
fetch_datetime = datetime.datetime.strptime(
filename.stem,
'facebook-ads-archive_{}_%Y-%m-%d_%H-%M-%S'.format(country_code),
'facebook-ads-archive_{}_%Y-%m-%d_%H-%M-%S'.format(
country_code),
)

with open(filename, 'r') as f:
@@ -32,7 +32,8 @@ def build_graph():
with open(dir_path / 'data.json', 'r') as f:
data = json.load(f)

nb_ads = data['lifetime_data'][str(yesterday)]['payload']['totalAds']
nb_ads = data['lifetime_data'][str(
yesterday)]['payload']['totalAds']

reports_timeseries[date_str] = nb_ads

@@ -44,5 +45,6 @@ def build_graph():
with open(config.DATA_DIR / 'facebook/graph_nb_ads_facebook.json', 'w') as f:
json.dump(timeseries, f)


if __name__ == '__main__':
build_graph()
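Reviewer note: the wrapped subscript `data['lifetime_data'][str(\nyesterday)]` above is legal but splits a key expression mid-call. An intermediate variable, sketched here with the `data` and `yesterday` names from `build_graph` (the variable name is illustrative), keeps each line short without the awkward break:

```python
# Illustrative only: name the nested lookup instead of wrapping it.
lifetime_data = data['lifetime_data'][str(yesterday)]
nb_ads = lifetime_data['payload']['totalAds']
```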
9 changes: 5 additions & 4 deletions facebook_fetch/put_snapshot_in_database.py
@@ -19,11 +19,11 @@ def process_batch():
client = MongoClient(config.MONGODB_URL)
ads_collection = client.facebook_ads.ads

#ads_collection.create_index('ad_id')
# ads_collection.create_index('ad_id')

user_access_token, browser = login.connect_and_get_user_token()

ads_to_process = ads_collection.find({ "snapshot" : { "$exists" : False } })
ads_to_process = ads_collection.find({"snapshot": {"$exists": False}})

for ad in ads_to_process:
ad_id = fetch.get_ad_id(ad)
@@ -37,7 +37,7 @@ def process_batch():
)

ads_collection.update_one(
{'_id': ad['_id'] },
{'_id': ad['_id']},
{
'$set': {
'ad_id': ad_id,
@@ -53,8 +53,9 @@ def process_batch():
time.sleep(5)
#user_access_token, browser = login.connect_and_get_user_token()


process_batch()
#while True:
# while True:
# try:
# process_batch()

5 changes: 2 additions & 3 deletions facebook_fetch/removal.py
@@ -7,7 +7,8 @@

def compare_ad_lists(ads_old, ads_new):

print('Comparing {} old ads and {} new ads.'.format(len(ads_old), len(ads_new)))
print('Comparing {} old ads and {} new ads.'.format(
len(ads_old), len(ads_new)))
print()

for field in fetch.FIELDS:
@@ -18,7 +19,6 @@ def compare_ad_lists(ads_old, ads_new):
))
print()


# Index

def to_dict(ads):
@@ -32,7 +32,6 @@ def to_dict(ads):
ads_old_by_id = to_dict(ads_old)
ads_new_by_id = to_dict(ads_new)


# Find removed ads

old_ids = set(ads_old_by_id.keys())
14 changes: 9 additions & 5 deletions facebook_fetch/reports.py
@@ -1,7 +1,6 @@

import json
import datetime
import pathlib
import logging
import time

@@ -23,7 +22,8 @@
'referer': 'https://www.facebook.com/ads/library/report/?source=archive-landing-page&country=FR',
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36',
}
TIME_PRESETS = ['yesterday', 'last_7_days', 'last_30_days', 'last_90_days', 'lifelong']
TIME_PRESETS = ['yesterday', 'last_7_days',
'last_30_days', 'last_90_days', 'lifelong']


def fetch_endpoint(endpoint, date, country_code, time_preset=None):
@@ -79,6 +79,7 @@ def fetch_endpoint(endpoint, date, country_code, time_preset=None):
data = json.loads(json_data)
return data


def fetch_for_date_country(today, country_code):
dates = [
today - datetime.timedelta(delta)
@@ -113,13 +114,15 @@ def fetch_for_date_country(today, country_code):
},
}

report_dir = config.DATA_DIR / 'facebook' / 'reports' / country_code / str(today)
report_dir = config.DATA_DIR / 'facebook' / \
'reports' / country_code / str(today)
report_dir.mkdir()
with open(report_dir / 'data.json', 'w') as f:
json.dump(country_data, f)

for time_preset in TIME_PRESETS:
url = country_data['download'][time_preset][str(today)]['payload']['uri']
url = country_data['download'][time_preset][str(
today)]['payload']['uri']
if not url:
continue
filename = url.split('/')[-1].split('?')[0]
@@ -128,9 +131,10 @@ def fetch_for_date_country(today, country_code):
with open(report_dir / filename, 'wb') as f:
f.write(response.content)


def fetch_for_date(today):
for country in facebook_fetch.fetch.COUNTRIES:
country_code=country['code']
country_code = country['code']
fetch_for_date_country(today=today, country_code=country_code)


6 changes: 4 additions & 2 deletions google_fetch/fetch.py
@@ -7,9 +7,11 @@


def write_to_file():
file_path = config.DATA_DIR / 'google/google-political-ads-transparency-bundle_{}.zip'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
file_path = config.DATA_DIR / 'google/google-political-ads-transparency-bundle_{}.zip'.format(
datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

response = requests.get('https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip')
response = requests.get(
'https://storage.googleapis.com/transparencyreport/google-political-ads-transparency-bundle.zip')
response.raise_for_status()
with open(file_path, 'wb') as f:
f.write(response.content)
6 changes: 4 additions & 2 deletions snapchat_fetch/fetch.py
@@ -7,9 +7,11 @@


def write_to_file():
file_path = config.DATA_DIR / 'snapchat' / str(config.current_year) / 'snapchat-political-ads_{}_fetched-{}.zip'.format(config.current_year, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
file_path = config.DATA_DIR / 'snapchat' / str(config.current_year) / 'snapchat-political-ads_{}_fetched-{}.zip'.format(
config.current_year, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

response = requests.get('https://storage.googleapis.com/ad-manager-political-ads-dump/political/{}/PoliticalAds.zip'.format(config.current_year))
response = requests.get(
'https://storage.googleapis.com/ad-manager-political-ads-dump/political/{}/PoliticalAds.zip'.format(config.current_year))
response.raise_for_status()
with open(file_path, 'wb') as f:
f.write(response.content)
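Reviewer note: `google_fetch/fetch.py` and `snapchat_fetch/fetch.py` wrap the same download-and-write pattern in the same awkward way. A shared helper, sketched below, would keep both call sites short; the function name and its placement are assumptions, not part of the PR:

```python
# Illustrative only: one helper for the repeated fetch-and-save pattern.
import requests


def download_to_file(url, file_path):
    """Download url and write the raw bytes to file_path."""
    response = requests.get(url)
    response.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(response.content)
```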