graphPaname/app/data_request.py

60 lines
1.4 KiB
Python
Raw Permalink Normal View History

from re import findall
from typing import List
from bs4 import BeautifulSoup
2020-05-21 18:45:51 +02:00
from requests import get
2020-06-12 19:21:50 +02:00
2020-06-15 01:19:13 +02:00
from constants import FLICKR_URL, DATASET_URL
2020-05-21 18:45:51 +02:00
2020-06-05 13:48:47 +02:00
def format_url(dataset) -> str:
    """
    Builds the API URL for the requested dataset

    The dataset identifier is substituted into the DATASET_URL template
    """
    return DATASET_URL.format(dataset)
def request_dataset(dataset, timeout=10):
    """
    Fetches the requested dataset from opendata's API

    Parameters:
        dataset: identifier substituted into the dataset URL template
        timeout: seconds to wait on the server before giving up

    Returns the decoded JSON body of the response

    Raises an exception if there's an HTTP error, or if the server
    does not answer within `timeout` seconds
    """
    url = format_url(dataset)
    # Without an explicit timeout the request can block forever on an
    # unresponsive server
    response = get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()
def request_flickr(keywords, timeout=10) -> str:
    """
    Returns the HTML of a Flickr search

    Parameters:
        keywords: search terms substituted as-is into the FLICKR_URL template
        timeout: seconds to wait on the server before giving up

    The response body is returned whatever the HTTP status is; only a
    connection problem or a timeout raises an exception
    """
    # NOTE(review): keywords are not URL-encoded here - presumably callers
    # pass URL-safe text; confirm against the FLICKR_URL template
    search_url = FLICKR_URL.format(keywords)
    # Without an explicit timeout the request can block forever on an
    # unresponsive server
    result = get(search_url, timeout=timeout)
    return result.text
2020-06-14 21:47:31 +02:00
def extract_urls(images) -> List[str]:
    """
    Creates proper URLs from the regex matches

    `images` is converted with str() and searched for
    live.staticflickr.com addresses ending in ".jpg"; every match is
    prefixed with "https://"

    Returns the list of URLs in order of appearance, empty if none match
    """
    # Raw string so "\S" reaches the regex engine instead of being an invalid
    # string escape; the dots are escaped so they only match a literal ".",
    # and the lazy "+?" stops at the first ".jpg" so two addresses sharing a
    # whitespace-free run are not merged into one match
    links = findall(r"(live\.staticflickr\.com/\S+?\.jpg)", str(images))
    return ["https://" + link for link in links]
def scrape_flickr(keywords) -> List[str]:
    """
    Builds the list of image links found in a Flickr search

    The search page is fetched for the given keywords, parsed, and the
    photo <div> elements are handed to extract_urls
    """
    page = BeautifulSoup(request_flickr(keywords), features="html.parser")
    photo_divs = page.find_all(
        "div",
        class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    return extract_urls(photo_divs)