graphPaname/app/data_request.py

from re import findall
from typing import List

from bs4 import BeautifulSoup
from requests import get

from constants import FLICKR_URL, URL


def format_url(dataset) -> str:
    """
    Builds the API's URL for the requested dataset by
    substituting the dataset into the URL template
    """
    return URL.format(dataset)


def request_dataset(dataset, *, timeout=10):
    """
    Fetches the requested dataset from opendata's API
    and returns the decoded JSON body

    dataset: value substituted into the API URL template
    timeout: seconds to wait for the server before giving up

    Raises an exception if there's an HTTP error
    or if the server does not answer within the timeout
    """
    url = format_url(dataset)
    # Without an explicit timeout the request can block forever
    # when the server accepts the connection but never answers
    response = get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def request_flickr(keywords, *, timeout=10) -> str:
    """
    Returns the HTML of a Flickr search

    keywords: value substituted into the Flickr search URL template
    timeout: seconds to wait for the server before giving up

    Raises an exception if the server does not answer within the timeout
    """
    search_url = FLICKR_URL.format(keywords)
    # Without an explicit timeout the request can block forever
    # when the server accepts the connection but never answers
    result = get(search_url, timeout=timeout)
    return result.text


def extract_urls(images) -> List[str]:
    """
    Creates proper URLs from the regex matches

    images: any object whose string form contains the image links
    Returns the matched links, each prefixed with "https://"
    """
    # Raw string so "\S" reaches the regex engine untouched; the dots are
    # escaped so they only match literal dots, and the lazy "+?" stops each
    # match at the first ".jpg" so adjacent links are not merged into one
    pattern = r"(live\.staticflickr\.com/\S+?\.jpg)"
    links = findall(pattern, str(images))
    return ["https://" + link for link in links]


def scrape_flickr(keywords) -> List[str]:
    """
    Runs a Flickr search for the keywords and returns
    the image links extracted from the result page
    """
    photo_class = "view photo-list-photo-view requiredToShowOnServer awake"
    page = BeautifulSoup(request_flickr(keywords), features="html.parser")
    photo_divs = page.find_all("div", class_=photo_class)
    return extract_urls(photo_divs)
Move web scraping logic into data_request 2020-06-14 21:29:42 +02:00			`from re import findall`
			`from typing import List`

			`from bs4 import BeautifulSoup`
Implement dataset request draft 2020-05-21 18:45:51 +02:00			`from requests import get`
Add dataframe column assertion 2020-06-12 19:21:50 +02:00
Move web scraping logic into data_request 2020-06-14 21:29:42 +02:00			`from constants import FLICKR_URL, URL`
Implement dataset request draft 2020-05-21 18:45:51 +02:00

Document all the functions 2020-06-05 13:48:47 +02:00			`def format_url(dataset) -> str:`
			`"""`
			`Constructs the API's URL for the requested dataset`
			`"""`
			`link = URL.format(dataset)`
			`return link`


			`def request_dataset(dataset):`
			`"""`
			`Fetches the requested dataset from opendata's API`
Add dataframe column assertion 2020-06-12 19:21:50 +02:00			`Raises an exception if there's an HTTP error`
Document all the functions 2020-06-05 13:48:47 +02:00			`"""`
			`url = format_url(dataset)`
Export JSON files in the data directory 2020-05-22 20:58:52 +02:00			`response = get(url)`
			`response.raise_for_status()`
			`data = response.json()`
Fetch JSON into variable instead of file 2020-06-13 21:58:17 +02:00			`return data`
Move web scraping logic into data_request 2020-06-14 21:29:42 +02:00

			`def request_flickr(keywords) -> str:`
			`"""`
			`Returns the HTML of a Flickr search`
			`"""`
			`search_url = FLICKR_URL.format(keywords)`
			`result = get(search_url)`
			`html = result.text`
			`return html`


Add photo visualization page 2020-06-14 21:47:31 +02:00			`def extract_urls(images):`
			`"""`
			`Creates proper URLs from the regex matches`
			`"""`
			`links = findall("(live.staticflickr.com/\S+.jpg)", str(images))`
			`formatted_urls = ["https://" + link for link in links]`
			`return formatted_urls`


			`def scrape_flickr(keywords) -> List[str]:`
Move web scraping logic into data_request 2020-06-14 21:29:42 +02:00			`"""`
			`Creates a list of image links from a Flickr search`
			`"""`
			`html = request_flickr(keywords)`
			`soup = BeautifulSoup(html, features="html.parser")`
			`images = soup.find_all(`
			`"div", class_="view photo-list-photo-view requiredToShowOnServer awake",`
			`)`
Add photo visualization page 2020-06-14 21:47:31 +02:00			`links = extract_urls(images)`
			`return links`