From 58c0f4897dfcdcdd8f1301f41af3a4e6e57c35e7 Mon Sep 17 00:00:00 2001
From: basset
Date: Sun, 14 Jun 2020 18:33:05 +0200
Subject: [PATCH 1/2] Testing the scraping of the image source.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/pruebascraping.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 app/pruebascraping.py

diff --git a/app/pruebascraping.py b/app/pruebascraping.py
new file mode 100644
index 0000000..0c2b75d
--- /dev/null
+++ b/app/pruebascraping.py
@@ -0,0 +1,20 @@
+from bs4 import BeautifulSoup
+import urllib.request
+import re
+
+
+def request_Flickr(keywords):
+    datos=urllib.request.urlopen("https://commons.wikimedia.org/w/index.php?search={keywords}&title=Special%3ASearch&go=Go&ns0=1&ns6=1&ns12=1&ns14=1&ns100=1&ns106=1").read().decode()
+
+    return datos;
+
+
+def scrap_Flickr(datos):
+    soup=BeautifulSoup(datos, features="lxml")
+    tag=soup.find("table", class_="searchResultImage")
+    images=tag.find_all("a", class_="image")
+    for image in images:
+        print(image["href"])
+
+
+scrap_Flickr(request_Flickr("paris"))

From 23dea062e575674fd850aade99a18a1d30aaaa9f Mon Sep 17 00:00:00 2001
From: coolneng
Date: Sun, 14 Jun 2020 21:24:27 +0200
Subject: [PATCH 2/2] Scrape Flickr images

---
 app/pruebascraping.py | 31 ++++++++++++++++++-------------
 constants.py          |  1 +
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/app/pruebascraping.py b/app/pruebascraping.py
index 0c2b75d..4d396ee 100644
--- a/app/pruebascraping.py
+++ b/app/pruebascraping.py
@@ -1,20 +1,25 @@
 from bs4 import BeautifulSoup
-import urllib.request
-import re
+from requests import get
+from constants import FLICKR_URL
+from re import findall
+from typing import List
 
 
-def request_Flickr(keywords):
-    datos=urllib.request.urlopen("https://commons.wikimedia.org/w/index.php?search={keywords}&title=Special%3ASearch&go=Go&ns0=1&ns6=1&ns12=1&ns14=1&ns100=1&ns106=1").read().decode()
-
-    return datos;
+def request_flickr(keywords) -> str:
+    search_url = FLICKR_URL.format(keywords)
+    result = get(search_url)
+    html = result.text
+    return html
 
 
-def scrap_Flickr(datos):
-    soup=BeautifulSoup(datos, features="lxml")
-    tag=soup.find("table", class_="searchResultImage")
-    images=tag.find_all("a", class_="image")
-    for image in images:
-        print(image["href"])
+def scrap_flickr(keywords) -> List[str]:
+    html = request_flickr(keywords)
+    soup = BeautifulSoup(html, features="html.parser")
+    images = soup.find_all(
+        "div", class_="view photo-list-photo-view requiredToShowOnServer awake",
+    )
+    image_links = findall("(live.staticflickr.com/\S+.jpg)", str(images))
+    return image_links
 
 
-scrap_Flickr(request_Flickr("paris"))
+scrap_flickr("paris")
diff --git a/constants.py b/constants.py
index 8ae10eb..31ee60a 100644
--- a/constants.py
+++ b/constants.py
@@ -5,6 +5,7 @@ DATASETS = [
     "deconfinement-rues-amenagees-pour-pietons",
 ]
 URL = "https://opendata.paris.fr/api/records/1.0/search/?dataset={}&q=&rows=-1"
+FLICKR_URL = "https://www.flickr.com/search/?text={}"
 COLUMNS = {
     "deconfinement-pistes-cyclables-temporaires": [
         "fields.geo_shape.coordinates",
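
Not part of the patches above: a minimal usage sketch of the scrap_flickr helper introduced in PATCH 2/2. It assumes the requests and beautifulsoup4 packages are installed, that constants.py is on the import path and the module is importable as app.pruebascraping (hypothetical layout inferred from the diffs), and that Flickr still serves the scraped markup. Note that pruebascraping.py also calls scrap_flickr("paris") at import time, so importing it already triggers one request.

    # Illustrative sketch only, not part of the patch series.
    # Exercise scrap_flickr and print a small sample of the scraped links.
    from app.pruebascraping import scrap_flickr  # import path is an assumption

    links = scrap_flickr("paris")   # List[str] of live.staticflickr.com *.jpg URLs
    print(len(links), "image links scraped")
    for link in links[:5]:          # show the first few results
        print(link)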