60 lines
1.4 KiB
Python
60 lines
1.4 KiB
Python
from re import findall
|
|
from typing import List
|
|
|
|
from bs4 import BeautifulSoup
|
|
from requests import get
|
|
|
|
from constants import FLICKR_URL, DATASET_URL
|
|
|
|
|
|
def format_url(dataset) -> str:
    """
    Build the API URL for the requested dataset.

    The dataset identifier is substituted into the ``DATASET_URL``
    template via ``str.format`` and the resulting string is returned.
    """
    return DATASET_URL.format(dataset)
|
|
|
|
|
|
def request_dataset(dataset, timeout=10):
    """
    Fetch the requested dataset from opendata's API and return the decoded JSON.

    Args:
        dataset: Identifier passed to ``format_url`` to build the request URL.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The response body decoded with ``response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = format_url(dataset)
    # Without an explicit timeout, requests waits forever on a stalled server.
    response = get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
def request_flickr(keywords, timeout=10) -> str:
    """
    Return the HTML of a Flickr search.

    Args:
        keywords: Search terms substituted into the ``FLICKR_URL`` template.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The response body as text. HTTP error statuses are not raised here;
        whatever body the server sent is returned as-is.

    Raises:
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    search_url = FLICKR_URL.format(keywords)
    # Without an explicit timeout, requests waits forever on a stalled server.
    result = get(search_url, timeout=timeout)
    return result.text
|
|
|
|
|
|
def extract_urls(images) -> List[str]:
    """
    Create proper URLs from the regex matches.

    Args:
        images: Any object; it is converted with ``str()`` and the resulting
            text is scanned for ``live.staticflickr.com/....jpg`` fragments.

    Returns:
        One ``https://`` URL per match, in order of appearance (duplicates
        are kept). An empty list when nothing matches.
    """
    # Raw string: "\S" in a normal literal is an invalid escape sequence.
    # Dots are escaped so they only match a literal ".", and the path is
    # matched non-greedily so that two URLs sharing one whitespace-free run
    # (e.g. separated by a comma) are not merged into a single match.
    pattern = r"live\.staticflickr\.com/\S+?\.jpg"
    return ["https://" + link for link in findall(pattern, str(images))]
|
|
|
|
|
|
def scrape_flickr(keywords) -> List[str]:
    """
    Build a list of image links from a Flickr search.

    The search page is fetched with ``request_flickr``, parsed with
    BeautifulSoup's ``html.parser``, and the ``div`` elements carrying the
    photo-list class string are handed to ``extract_urls``.
    """
    page = BeautifulSoup(request_flickr(keywords), features="html.parser")
    photo_divs = page.find_all(
        "div",
        class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    return extract_urls(photo_divs)
|