# graphPaname/app/data_request.py

from re import findall
from typing import List

from bs4 import BeautifulSoup
from requests import get

from constants import FLICKR_URL, DATASET_URL


def format_url(dataset) -> str:
    """
    Builds the API URL of a dataset by inserting the given value
    into the DATASET_URL template
    """
    return DATASET_URL.format(dataset)


def request_dataset(dataset, timeout: float = 30):
    """
    Fetches the requested dataset from opendata's API
    and returns the decoded JSON body of the response

    timeout is the number of seconds to wait for the server before
    giving up; without one, the request can block forever when the
    server stops responding.

    Raises an exception if there's an HTTP error, or if the server
    does not answer within the timeout
    """
    response = get(format_url(dataset), timeout=timeout)
    # Turn 4xx/5xx answers into an exception instead of decoding an error body
    response.raise_for_status()
    return response.json()


def request_flickr(keywords, timeout: float = 30) -> str:
    """
    Returns the HTML of a Flickr search

    The keywords are inserted into the FLICKR_URL template to build the
    search address. timeout is the number of seconds to wait for the
    server before giving up; without one, the request can block forever
    when the server stops responding.

    The HTTP status is not checked: the body of the response is returned
    as text whatever the status code is.
    """
    search_url = FLICKR_URL.format(keywords)
    result = get(search_url, timeout=timeout)
    return result.text


def extract_urls(images) -> List[str]:
    """
    Creates proper URLs from the regex matches

    The argument is converted to a string with str() and scanned for
    protocol-less Flickr image links ("live.staticflickr.com/<path>.jpg").
    Every match is returned prefixed with "https://", in order of
    appearance; an input without any link gives an empty list.
    """
    # Raw string: "\S" is an invalid escape sequence in a normal literal.
    # The dots are escaped so they only match a literal ".", and the lazy
    # "+?" ends each match at the first ".jpg", so two links that are not
    # separated by whitespace are no longer merged into one bogus URL.
    links = findall(r"live\.staticflickr\.com/\S+?\.jpg", str(images))
    return ["https://" + link for link in links]


def scrape_flickr(keywords) -> List[str]:
    """
    Runs a Flickr search for the keywords and returns the image links
    found in the photo containers of the result page
    """
    page = BeautifulSoup(request_flickr(keywords), features="html.parser")
    # Only the <div> elements carrying this exact class string are kept
    photo_divs = page.find_all(
        "div",
        class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    return extract_urls(photo_divs)