Move web scraping logic into data_request
This commit is contained in:
parent
b74ceb05c8
commit
dd7f1bab8d
|
@ -1,6 +1,10 @@
|
||||||
|
from re import findall
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
from requests import get
|
from requests import get
|
||||||
|
|
||||||
from constants import URL
|
from constants import FLICKR_URL, URL
|
||||||
|
|
||||||
|
|
||||||
def format_url(dataset) -> str:
|
def format_url(dataset) -> str:
|
||||||
|
@ -21,3 +25,26 @@ def request_dataset(dataset):
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
data = response.json()
|
data = response.json()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def request_flickr(keywords) -> str:
    """
    Returns the HTML of a Flickr search

    The search URL is built by substituting ``keywords`` into ``FLICKR_URL``.

    Raises the HTTP error produced by ``raise_for_status`` when the response
    carries a 4xx/5xx status, so an error page is never returned as if it
    were search results (same handling as ``request_dataset``).
    """
    search_url = FLICKR_URL.format(keywords)
    result = get(search_url)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    result.raise_for_status()
    return result.text
|
||||||
|
|
||||||
|
|
||||||
|
# Raw string so ``\S`` reaches the regex engine untouched; the dots are escaped
# so they only match a literal ".", and ``+?`` is lazy so two URLs sitting in
# the same whitespace-free run are reported separately instead of merged.
_FLICKR_IMAGE_PATTERN = r"(live\.staticflickr\.com/\S+?\.jpg)"


def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    Fetches the search page with ``request_flickr``, keeps the ``div``
    elements whose class attribute is the photo-view class string below, and
    extracts every ``live.staticflickr.com/... .jpg`` link found in their
    markup.

    Returns the links in order of appearance (scheme-less, exactly as
    matched); the list is empty when nothing matches.
    """
    html = request_flickr(keywords)
    soup = BeautifulSoup(html, features="html.parser")
    images = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # The links are pulled out of the stringified tag list with the regex
    # rather than read from a specific attribute of each tag.
    image_links = findall(_FLICKR_IMAGE_PATTERN, str(images))
    return image_links
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from requests import get
|
|
||||||
from constants import FLICKR_URL
|
|
||||||
from re import findall
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
|
|
||||||
def request_flickr(keywords) -> str:
    """
    Returns the HTML of a Flickr search

    ``keywords`` is substituted into ``FLICKR_URL`` to build the search URL.

    Raises the HTTP error produced by ``raise_for_status`` when the response
    carries a 4xx/5xx status, so an error page is never parsed as results.
    """
    search_url = FLICKR_URL.format(keywords)
    result = get(search_url)
    # Surface HTTP failures here rather than returning the error page's HTML.
    result.raise_for_status()
    html = result.text
    return html
|
|
||||||
|
|
||||||
|
|
||||||
def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    The search page HTML comes from ``request_flickr``; only ``div`` elements
    whose class attribute is the photo-view class string below are kept, and
    every ``live.staticflickr.com/... .jpg`` link in their markup is returned
    in order of appearance. The list is empty when nothing matches.
    """
    html = request_flickr(keywords)
    soup = BeautifulSoup(html, features="html.parser")
    images = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # Raw string keeps ``\S`` intact for the regex engine; escaped dots match
    # only a literal "."; the lazy ``+?`` stops at the first ".jpg" so
    # neighbouring URLs are not fused into a single match.
    link_pattern = r"(live\.staticflickr\.com/\S+?\.jpg)"
    image_links = findall(link_pattern, str(images))
    return image_links
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level call: runs a Flickr search for "paris" whenever this module is
# executed or imported; the returned list of links is discarded.
scrap_flickr("paris")
|
|
Loading…
Reference in New Issue