Add docstrings and reformat
glenn-jocher committed Apr 28, 2024
1 parent cb18f47 commit 6e6a17e
Showing 5 changed files with 28 additions and 27 deletions.
flickr_scraper.py (5 changes: 3 additions & 2 deletions)
@@ -3,7 +3,6 @@
import argparse
import os
import time

from flickrapi import FlickrAPI

from utils.general import download_uri
@@ -13,6 +12,7 @@


def get_urls(search="honeybees on flowers", n=10, download=False):
"""Fetch Flickr URLs for `search` term images, optionally downloading them; supports up to `n` images."""
t = time.time()
flickr = FlickrAPI(key, secret)
license = () # https://www.flickr.com/services/api/explore/?method=flickr.photos.licenses.getInfo
@@ -36,7 +36,8 @@ def get_urls(search="honeybees on flowers", n=10, download=False):
                # construct url https://www.flickr.com/services/api/misc.urls.html
                url = photo.get("url_o")  # original size
                if url is None:
-                    url = f"https://farm{photo.get('farm')}.staticflickr.com/{photo.get('server')}/{photo.get('id')}_{photo.get('secret')}_b.jpg"
+                    url = (f"https://farm{photo.get('farm')}.staticflickr.com/{photo.get('server')}/"
+                           f"{photo.get('id')}_{photo.get('secret')}_b.jpg")

                # download
                if download:
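For reference, a minimal usage sketch of the function documented above (illustrative arguments; a valid Flickr API key and secret must already be set in the module):

# Hypothetical usage sketch, not part of this commit.
from flickr_scraper import get_urls

get_urls(search="honeybees on flowers", n=10, download=True)  # fetch 10 image URLs and download them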
utils/clean_images.py (22 changes: 12 additions & 10 deletions)
@@ -1,22 +1,24 @@
import argparse
-import cv2
import glob
-import numpy as np
import os
-from PIL import Image
from multiprocessing.pool import ThreadPool
from pathlib import Path
+
+import cv2
+import numpy as np
+from PIL import Image
from tqdm import tqdm


def scan(files, max_wh=1920, remove=False, multi_thread=True, tojpg=False, quality=95, workers=8):
-    # Args:
-    #     files: list of image files
-    #     max_wh: maximum image wh (larger images will be reduced in size)
-    #     remove: delete corrupted/duplicate images
-    #     tojpg: replace current image with jpg for smaller size / faster loading
-    #     quality: PIL JPG saving quality (0-100)
+    """Scans and processes images by resizing, converting to jpg, and removing duplicates or corrupt files.
+    Args:
+        files: list of image files
+        max_wh: maximum image wh (larger images will be reduced in size)
+        remove: delete corrupted/duplicate images
+        tojpg: replace current image with jpg for smaller size / faster loading
+        quality: PIL JPG saving quality (0-100)
+    """
    img_formats = [".bmp", ".jpg", ".jpeg", ".png", ".tif", ".tiff", ".dng"]  # valid image formats from YOLOv5

    def scan_one_file(f):
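A minimal usage sketch of scan() under the signature shown above (paths are illustrative):

# Hypothetical usage sketch, not part of this commit.
from glob import glob

from utils.clean_images import scan

files = sorted(glob("images/**/*.*", recursive=True))  # candidate image files
scan(files, max_wh=1920, tojpg=True, quality=95)  # resize oversized images and convert to JPG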
utils/flickr_scraper_noapi.py (5 changes: 2 additions & 3 deletions)
@@ -1,6 +1,5 @@
+import urllib.request

import numpy as np
-import urllib.request

fp = urllib.request.urlopen("https://www.flickr.com/search?text=flowers&structured=yes&page=2")

@@ -16,7 +15,7 @@
res = [i for i in range(len(str)) if str.startswith("_b.jpg", i)]
a = []
for i in res:
s = "https://" + str[i - 70 : i + 6].replace("\\", "").split("//")[-1]
s = "https://" + str[i - 70: i + 6].replace("\\", "").split("//")[-1]
a.append(s)
a = list(np.unique(np.array(a)))
print(len(a), a)
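The slicing above recovers each URL by stepping 70 characters back from every "_b.jpg" hit; a regular-expression sketch of the same idea (assuming the page source escapes slashes as "\/", which is why the script strips backslashes):

# Hypothetical regex variant, not part of this commit.
import re
import urllib.request

html = urllib.request.urlopen("https://www.flickr.com/search?text=flowers&structured=yes&page=2").read().decode()
urls = sorted({m.replace("\\/", "/") for m in re.findall(r"https:[\\/]+[\w.\\/-]+_b\.jpg", html)})
print(len(urls), urls[:5])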
utils/general.py (5 changes: 2 additions & 3 deletions)
@@ -2,14 +2,13 @@
# Written by Glenn Jocher (glenn.jocher@ultralytics.com) for https://github.com/ultralytics

import os
+from pathlib import Path

import requests
from PIL import Image
-from pathlib import Path


def download_uri(uri, dir="./"):
-    # Download a file from a given URI, including minimal checks
+    """Downloads file from URI, performing checks and renaming; supports timeout and image format suffix addition."""

    # Download
    f = dir + os.path.basename(uri)  # filename
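A minimal usage sketch of download_uri() as documented above (URL taken from the example list in utils/multithread_example.py; saves into the default "./" directory):

# Hypothetical usage sketch, not part of this commit.
from utils.general import download_uri

download_uri("https://live.staticflickr.com/4571/37795143414_8ccae77768_o.jpg")  # saves the file to ./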
utils/multithread_example.py (18 changes: 9 additions & 9 deletions)
@@ -1,22 +1,22 @@
#!/usr/bin/env python
from multiprocessing.pool import ThreadPool
from time import time as timer
+from urllib import request

from tqdm import tqdm
-from urllib import request

dir = "../" # directory to save image downloads
urls = [
"https://farm8.staticflickr.com/7428/27138770446_6618c10ffb_b.jpg",
"https://live.staticflickr.com/4571/37795143414_8ccae77768_o.jpg",
"https://live.staticflickr.com/1732/27535176747_78b83536af_o.jpg",
"https://live.staticflickr.com/331/18765122504_ea8c9ea6ce_o.jpg",
"https://live.staticflickr.com/1919/44312457665_6f7b6c2c42_o.jpg",
"https://farm4.staticflickr.com/3597/3359921429_fc86a7519e_b.jpg",
] * 10
"https://farm8.staticflickr.com/7428/27138770446_6618c10ffb_b.jpg",
"https://live.staticflickr.com/4571/37795143414_8ccae77768_o.jpg",
"https://live.staticflickr.com/1732/27535176747_78b83536af_o.jpg",
"https://live.staticflickr.com/331/18765122504_ea8c9ea6ce_o.jpg",
"https://live.staticflickr.com/1919/44312457665_6f7b6c2c42_o.jpg",
"https://farm4.staticflickr.com/3597/3359921429_fc86a7519e_b.jpg",
] * 10


def fetch_url(url):
"""Downloads a file from a URL to a local directory, returning the URL and any error encountered."""
try:
f = dir + url.split("/")[-1]
request.urlretrieve(url, f)
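The driver code below the fold is not shown in this diff; a sketch of the usual ThreadPool pattern that fetch_url() supports, relying only on the names visible above:

# Hypothetical driver loop; the collapsed portion of the file is not shown in this diff.
results = ThreadPool(8).imap_unordered(fetch_url, urls)  # 8 worker threads
for url, error in tqdm(results, total=len(urls)):
    if error is not None:
        print(f"error downloading {url}: {error}")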
