Move web scraping logic into data_request

2020-06-14 21:29:42 +02:00 · 2020-06-14 21:29:42 +02:00 · dd7f1bab8d
parent b74ceb05c8
commit dd7f1bab8d
2 changed files with 28 additions and 26 deletions
--- a/app/data_request.py
+++ b/app/data_request.py
@ -1,6 +1,10 @@
 from re import findall
 from typing import List
 from bs4 import BeautifulSoup
 from requests import get
-from constants import URL
+from constants import FLICKR_URL, URL
 def format_url(dataset) -> str:
@ -21,3 +25,26 @@ def request_dataset(dataset):
    response.raise_for_status()
    data = response.json()
    return data
 def request_flickr(keywords, timeout: float = 10.0) -> str:
    """
    Returns the HTML of a Flickr search

    Args:
        keywords: Search terms substituted into ``FLICKR_URL`` with
            ``str.format``.
        timeout: Seconds to wait for the server to send data before giving
            up, so a stalled connection cannot block the caller forever.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``
            seconds.
    """
    search_url = FLICKR_URL.format(keywords)
    response = get(search_url, timeout=timeout)
    # Fail loudly on HTTP errors, as request_dataset does, instead of
    # handing an error page to the HTML parser as if it were results.
    response.raise_for_status()
    return response.text
 def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    Args:
        keywords: Search terms forwarded to ``request_flickr``.

    Returns:
        Every substring of the matched photo ``div`` elements that starts
        with ``live.staticflickr.com/`` and ends with ``.jpg``. The links
        carry no URL scheme. The list is empty when nothing matches.
    """
    html = request_flickr(keywords)
    soup = BeautifulSoup(html, features="html.parser")
    # Only the photo tiles are kept; the links are then pulled out of their
    # serialized markup rather than out of a specific attribute.
    images = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # Raw string: "\S" must reach the regex engine untouched (as a plain
    # string escape it is invalid and triggers a warning). The dots are
    # escaped so they match a literal "." instead of any character.
    image_links = findall(r"(live\.staticflickr\.com/\S+\.jpg)", str(images))
    return image_links
--- a/app/pruebascraping.py
+++ b/app/pruebascraping.py
@ -1,25 +0,0 @@
 from bs4 import BeautifulSoup
 from requests import get
 from constants import FLICKR_URL
 from re import findall
 from typing import List
 def request_flickr(keywords, timeout: float = 10.0) -> str:
    """
    Returns the HTML of a Flickr search

    Args:
        keywords: Search terms substituted into ``FLICKR_URL`` with
            ``str.format``.
        timeout: Seconds to wait for the server to send data before giving
            up.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``
            seconds.
    """
    search_url = FLICKR_URL.format(keywords)
    # A timeout keeps a stalled connection from hanging the script.
    response = get(search_url, timeout=timeout)
    # Surface HTTP errors here rather than scraping an error page.
    response.raise_for_status()
    return response.text
 def scrap_flickr(keywords) -> List[str]:
    """
    Creates a list of image links from a Flickr search

    Args:
        keywords: Search terms forwarded to ``request_flickr``.

    Returns:
        Every substring of the matched photo ``div`` elements that starts
        with ``live.staticflickr.com/`` and ends with ``.jpg`` (no URL
        scheme). The list is empty when nothing matches.
    """
    html = request_flickr(keywords)
    soup = BeautifulSoup(html, features="html.parser")
    images = soup.find_all(
        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
    )
    # Raw string keeps "\S" as a regex token; escaped dots match only a
    # literal "." so unrelated text such as "Xjpg" is no longer accepted.
    image_links = findall(r"(live\.staticflickr\.com/\S+\.jpg)", str(images))
    return image_links
 # Ad-hoc manual check: runs one Flickr search for "paris" whenever this
 # module is executed or imported; the returned list is discarded.
 scrap_flickr("paris")