Move web scraping logic into data_request

This commit is contained in:
coolneng 2020-06-14 21:29:42 +02:00
parent b74ceb05c8
commit dd7f1bab8d
Signed by: coolneng
GPG Key ID: 9893DA236405AF57
2 changed files with 28 additions and 26 deletions

View File

@ -1,6 +1,10 @@
from re import findall
from typing import List
from bs4 import BeautifulSoup
from requests import get
from constants import URL
from constants import FLICKR_URL, URL
def format_url(dataset) -> str:
@ -21,3 +25,26 @@ def request_dataset(dataset):
response.raise_for_status()
data = response.json()
return data
def request_flickr(keywords) -> str:
    """
    Returns the HTML of a Flickr search

    The search URL is built by substituting keywords into FLICKR_URL.
    Raises requests.HTTPError when the response has a 4xx/5xx status, the
    same check request_dataset performs on its own response.
    """
    search_url = FLICKR_URL.format(keywords)
    result = get(search_url)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # scraper, which would otherwise just yield an empty list of links
    result.raise_for_status()
    return result.text
def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    Fetches the search page for keywords, selects the photo <div> elements
    and returns every "live.staticflickr.com/....jpg" address found in the
    text rendering of those elements. The links are returned as matched,
    i.e. starting at the host name without a URL scheme.
    """
    html = request_flickr(keywords)
    soup = BeautifulSoup(html, features="html.parser")
    images = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # Raw string so that \S reaches the regex engine untouched (a plain
    # string treats it as an invalid escape); the dots are escaped so they
    # only match a literal "." rather than any character
    image_links = findall(r"(live\.staticflickr\.com/\S+\.jpg)", str(images))
    return image_links

View File

@ -1,25 +0,0 @@
from bs4 import BeautifulSoup
from requests import get
from constants import FLICKR_URL
from re import findall
from typing import List
def request_flickr(keywords) -> str:
    """
    Fetches the Flickr search page for the given keywords

    The keywords are substituted into FLICKR_URL and the body of the
    response is returned as text.
    """
    response = get(FLICKR_URL.format(keywords))
    return response.text
def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    Downloads the search page through request_flickr, keeps the photo <div>
    elements and extracts every "live.staticflickr.com/....jpg" address from
    their text rendering. Links are returned without a URL scheme.
    """
    soup = BeautifulSoup(request_flickr(keywords), features="html.parser")
    photo_divs = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # Raw string keeps \S intact for the regex engine, and the escaped dots
    # match a literal "." instead of any character
    return findall(r"(live\.staticflickr\.com/\S+\.jpg)", str(photo_divs))
# Runs a Flickr search for "paris" as soon as this module is loaded; the
# returned list of links is neither stored nor printed
scrap_flickr("paris")