master-thesis/talk/pics/original/scrape.py
2020-10-16 18:06:04 +02:00

36 lines
1.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import os
import re
import sys
import requests
from bs4 import BeautifulSoup
import IPython
from os import path
if __name__ != '__main__':
    raise ImportError('This is a command-line script and not supposed to be imported.')

# Look up photographer credits for Unsplash pictures in the current directory.
#
# Every file whose name ends in ``-<11-char id>-unsplash.jpg`` is looked up on
# unsplash.com; for each one a line ``<photographer>: <title or category>`` is
# printed to stdout. IDs that could not be resolved are printed to stderr.
#
# NOTE(review): the listing this script was taken from was flagged as
# containing ambiguous Unicode characters; verify the regex literals below
# against the original file before relying on them.

# Seconds to wait for Unsplash before giving up on a photo; without a timeout
# a stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30


def _split_title(page_title):
    """Split the text of a photo page's <title> into ``(title, category)``.

    ``title`` is the empty string when the page title carries no photo
    description, so that callers can fall back to the category.

    Raises:
        ValueError: if the page title matches neither known layout.
    """
    match = re.match(r'(.*) photo Free (.*)Image on Unsplash', page_title)
    if match:
        return match.groups()
    match = re.match(r'Free (.*)Image on Unsplash', page_title)
    if match is None:
        raise ValueError(f'unrecognised page title: {page_title!r}')
    # No description in this layout: return an empty title instead of keeping
    # the raw page title, otherwise the category fallback could never trigger.
    return '', match.group(1)


def _photographer_name(soup):
    """Return the photographer name taken from the first profile-link image.

    Profile links are the anchors whose ``href`` starts with ``/@``; the name
    is parsed out of the ``alt`` text of the image inside such a link.
    """
    profile_images = (
        a.find('img')
        for a in soup.find_all('a')
        # .get(): anchors without an href must be skipped, not raise KeyError.
        if a.get('href', '').startswith('/@')
    )
    alts = [img['alt'] for img in profile_images if img]
    return re.match("Go to (.*)'s profile", alts[0]).group(1)


# NOTE(review): the ID character class has no '_'; confirm whether Unsplash
# IDs can contain underscores, in which case such files are skipped here.
pic_ids = [re.match(r'.*-([0-9a-zA-Z-]{11})-unsplash\.jpg$', fn) for fn in os.listdir()]
pic_ids = [match.group(1) for match in pic_ids if match]

for pic_id in pic_ids:
    try:
        res = requests.get(f'https://unsplash.com/photos/{pic_id}', timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(res.text, features='lxml')
        title, category = _split_title(soup.find('title').text)
        name = _photographer_name(soup)
        print(f'{name}: {title if title else category.strip()}')
    except Exception:
        # Deliberate best-effort: report the failing ID and carry on with the
        # next picture. Exception (not a bare except) lets Ctrl-C and
        # SystemExit stop the run.
        print(pic_id, file=sys.stderr)