From 23dea062e575674fd850aade99a18a1d30aaaa9f Mon Sep 17 00:00:00 2001 From: coolneng Date: Sun, 14 Jun 2020 21:24:27 +0200 Subject: [PATCH] Scrape Flickr images --- app/pruebascraping.py | 31 ++++++++++++++++++------------- constants.py | 1 + 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/app/pruebascraping.py b/app/pruebascraping.py index 0c2b75d..4d396ee 100644 --- a/app/pruebascraping.py +++ b/app/pruebascraping.py @@ -1,20 +1,25 @@ from bs4 import BeautifulSoup -import urllib.request -import re +from requests import get +from constants import FLICKR_URL +from re import findall +from typing import List -def request_Flickr(keywords): - datos=urllib.request.urlopen("https://commons.wikimedia.org/w/index.php?search={keywords}&title=Special%3ASearch&go=Go&ns0=1&ns6=1&ns12=1&ns14=1&ns100=1&ns106=1").read().decode() - - return datos; +def request_flickr(keywords) -> str: + search_url = FLICKR_URL.format(keywords) + result = get(search_url) + html = result.text + return html -def scrap_Flickr(datos): - soup=BeautifulSoup(datos, features="lxml") - tag=soup.find("table", class_="searchResultImage") - images=tag.find_all("a", class_="image") - for image in images: - print(image["href"]) +def scrap_flickr(keywords) -> List[str]: + html = request_flickr(keywords) + soup = BeautifulSoup(html, features="html.parser") + images = soup.find_all( + "div", class_="view photo-list-photo-view requiredToShowOnServer awake", + ) + image_links = findall("(live.staticflickr.com/\S+.jpg)", str(images)) + return image_links -scrap_Flickr(request_Flickr("paris")) +scrap_flickr("paris") diff --git a/constants.py b/constants.py index 8ae10eb..31ee60a 100644 --- a/constants.py +++ b/constants.py @@ -5,6 +5,7 @@ DATASETS = [ "deconfinement-rues-amenagees-pour-pietons", ] URL = "https://opendata.paris.fr/api/records/1.0/search/?dataset={}&q=&rows=-1" +FLICKR_URL = "https://www.flickr.com/search/?text={}" COLUMNS = { "deconfinement-pistes-cyclables-temporaires": [ "fields.geo_shape.coordinates",