Data Scraping & SQL Database: Pictures
I created a program that uses the Reddit API to legally scrape images and videos that can be used for model training in computer vision applications.
It is fully automated to scrape the links of the top 5 pictures and videos every day from my favourite subreddits.
It then adds these links along with detailed information (including source credits) to a SQL database using MySQL and Python.
The results are extracted from the created database using SQL queries such as SELECTs and JOINs with WHERE and ORDER BY clauses.
They are then displayed as posts on an HTML page, with autoplay and lazy loading of embedded media to lower bandwidth usage.
Code
I have updated the code to store the scraped links and their details in a SQL database using MySQL and Python.
The SQL code was implemented in the following way:
import mysql.connector as mysql
Connect to the MySQL server using your username and password
db = mysql.connect(
host = "localhost",
user = "",
passwd = ""
)
Create a cursor that allows execution of SQL statements
cursor = db.cursor()
Create your reddit database
cursor.execute("CREATE DATABASE reddit")
Connect to your new database
db = mysql.connect(
host = "localhost",
user = "",
passwd = "",
database = "reddit"
)
cursor = db.cursor()
Create tables for posts and files with file names as primary keys because each file is unique
cursor.execute("""CREATE TABLE posts (file_name VARCHAR(255) NOT NULL PRIMARY KEY,
source_url VARCHAR(255), subreddit_name VARCHAR(255), post_name VARCHAR(255),
date_posted DATETIME)""")
cursor.execute("""CREATE TABLE files (file_name VARCHAR(255) NOT NULL PRIMARY KEY,
file_url VARCHAR(255), has_audio BOOLEAN, is_video BOOLEAN)""")
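To double-check that the tables were created with the columns you expect, you can ask MySQL to describe them. This is just a quick sanity check using the same cursor:
# print the column definitions of each table
for table in ("posts", "files"):
    cursor.execute("DESCRIBE " + table)
    for column in cursor.fetchall():
        print(table, column)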
Execute SQL queries to insert the values associated with each column into your tables (these commands replace the dataframe append in the main code)
query = "INSERT INTO posts (file_name, source_url, subreddit_name, post_name, date_posted)
VALUES (%s, %s, %s, %s, %s)"
values = (file_name, source_url, subreddit_name, submission.title, date_posted)
cursor.execute(query, values)
query = "INSERT INTO files (file_name, file_url, has_audio, is_video) VALUES (%s, %s, %s, %s)"
values = (file_name, url, file_has_audio(file_name, url), file_is_video(file_name))
cursor.execute(query, values)
Commit the changes to the database after executing the queries above
db.commit()
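If you are inserting many rows at once, executemany batches them into a single call. A minimal sketch (the rows list here is purely illustrative):
# insert several files in one call instead of looping over cursor.execute
query = "INSERT INTO files (file_name, file_url, has_audio, is_video) VALUES (%s, %s, %s, %s)"
rows = [("cat.mp4", "https://example.com/cat.mp4", True, True),
        ("dog.jpg", "https://example.com/dog.jpg", False, False)]
cursor.executemany(query, rows)
db.commit()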
You can check whether a file already exists in the database by querying it with SELECT
cursor.execute("SELECT file_name FROM files")
names = [name for name, in cursor.fetchall()]  # fetchall returns a list of tuples, so unpack them
if your_file in names: print("File exists")
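Fetching every file name works, but it is usually cheaper to let the database do the filtering. The same check with a WHERE clause (again assuming cursor and a your_file variable) could look like:
# ask the database directly whether the file name is present
cursor.execute("SELECT 1 FROM files WHERE file_name = %s LIMIT 1", (your_file,))
if cursor.fetchone() is not None: print("File exists")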
You can now use the updated database to build your HTML code.
SELECT the DISTINCT subreddits in the database
cursor.execute("SELECT DISTINCT subreddit_name FROM posts")
subreddit_names = cursor.fetchall()
Build the HTML body containing code to display text, images and videos on the webpage
html_body = {}
Loop through the subreddits individually, and join the tables using queries to get the file details
Use SELECT * to obtain all columns
INNER JOIN the two tables ON file_name to get the intersection, i.e. the rows where file_name exists in both tables (in this example there are no missing file_name values)
Use WHERE to filter for the subreddit you are currently looking at
ORDER BY the date the file was posted to reddit in descending order, so that newer links will show at the top of the web page
(You could alternatively also use a UNION query of the two tables)
Within the file loop, remove links from the database that no longer work
for subreddit_name, in subreddit_names:
    cursor.execute("""SELECT * FROM posts
                      INNER JOIN files
                      ON posts.file_name = files.file_name
                      WHERE posts.subreddit_name = %s
                      ORDER BY posts.date_posted DESC""", (subreddit_name,))
    subreddit_files = cursor.fetchall()
    html_code = []
    # loop through each row; each row corresponds to a different file and is returned as a tuple
    for file_name, source_url, _, post_name, date_posted, _, file_url, has_audio, is_video in subreddit_files:
        # delete links that no longer work from both tables
        if open_url(file_url) is None:
            cursor.execute("DELETE FROM posts WHERE file_name = %s", (file_name,))
            cursor.execute("DELETE FROM files WHERE file_name = %s", (file_name,))
            db.commit()
            continue
        # assemble your HTML code for the file using the database row and append it to the list
        # (a rough example of such a snippet follows after this loop)
        html_code.append(your_html_code_for_each_file_using_database)
    # join the HTML code together for each subreddit
    html_body.update({subreddit_name: "".join(html_code)})
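The your_html_code_for_each_file_using_database placeholder above is whatever markup you want for a single post. As a rough sketch (the exact tags are up to you), inside the file loop it could be built like this, with muted autoplay for videos and lazy loading for images:
# build a small HTML snippet for one row, depending on whether it is a video or an image
if is_video:
    your_html_code_for_each_file_using_database = (
        "<h3>%s</h3><video controls autoplay muted loop preload='none'>"
        "<source src='%s' type='video/mp4'></video>" % (post_name, file_url))
else:
    your_html_code_for_each_file_using_database = (
        "<h3>%s</h3><img src='%s' loading='lazy' alt='%s'>"
        % (post_name, file_url, post_name))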
Close the database
db.close()
If anything went wrong you can drop the database and start over
cursor.execute("DROP DATABASE reddit")
Main Code
The main code is below. This example adds each file's details to a .csv database (which the SQL inserts above can replace).
Import required modules
import os
import praw
import requests
from bs4 import BeautifulSoup as bs
from moviepy.editor import VideoFileClip
import pandas as pd
from datetime import datetime
Create a reddit instance using your Reddit API log-in details and the PRAW module
reddit = praw.Reddit(client_id="", client_secret="", password="", user_agent="", username="")
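The credential strings are left blank here on purpose. One way to keep them out of the script is to read them from environment variables (os is already imported below; the variable names here are just my choice):
# read the Reddit API credentials from environment variables instead of hard-coding them
reddit = praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                     password=os.environ["REDDIT_PASSWORD"],
                     user_agent=os.environ["REDDIT_USER_AGENT"],
                     username=os.environ["REDDIT_USERNAME"])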
Choose subreddit
subreddit_name = "aww"
Open the database
database_loc = r"./reddit_database.csv"
# load file if it exists, or create it
if os.path.exists(database_loc):
reddit_df = pd.read_csv(database_loc)
else:
reddit_df = pd.DataFrame([], columns = ["file_name","subreddit_name","post_name",
"date_posted","has_audio","is_video","source_url","file_url"])
Get the top 5 hot posts in the subreddit
subreddit = reddit.subreddit(subreddit_name)
submissions = []
for submission in subreddit.hot(limit=5):
submissions.append(submission)
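The code above uses the hot listing; if you specifically want the top 5 posts of the day (as described in the intro), PRAW also provides a top listing that you could use instead:
# alternative: take the top 5 posts of the last day rather than the current hot posts
for submission in subreddit.top(time_filter="day", limit=5):
    submissions.append(submission)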
Get the file urls and add them to the database along with useful information
for i, submission in enumerate(submissions):
# convert the created_utc float timestamp into a datetime
date_posted = datetime.utcfromtimestamp(submission.created_utc)
source_url = submission.url
# find the direct link to the video or image
url = find_file_link(source_url)
if url is None: continue
# get file name
file_name = url.split('/')[-1]
# skip file if it already exists
if file_name in reddit_df['file_name'].tolist(): print("File exists:", url); continue
# load file & check that the url works
file = open_url(url)
if file is None: continue
# optional: save the file in chunks (if you don't want just the links)
# print("Saving file", i+1, "of", len(submissions), ":", url)
# save_dir = "./"
# save_file(save_dir, file_name, file)
# append to dataframe
reddit_df = reddit_df.append({'file_name': file_name, 'subreddit_name': subreddit_name,
'post_name': submission.title, 'date_posted': date_posted,
'has_audio': file_has_audio(file_name, url),
'is_video': file_is_video(file_name),
'source_url': source_url, 'file_url': url},
ignore_index=True)
# convert time to dataframe format
reddit_df.date_posted = pd.to_datetime(reddit_df.date_posted, format="%Y-%m-%d %H:%M:%S")
# save database to file
reddit_df.to_csv(database_loc, index = False, header=True)
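If duplicate rows ever slip in (for example after editing the .csv by hand), you could drop them on the file name column just before the to_csv call above; this mirrors the PRIMARY KEY constraint in the MySQL tables:
# keep only the first occurrence of each file name, like the PRIMARY KEY does in the SQL version
reddit_df = reddit_df.drop_duplicates(subset="file_name", keep="first")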
The functions called above are defined below.
Convert the source url to a direct link to the file
def find_file_link(url):
file_extension = url.split('.')[-1]
valid_extension_list = ["jpg","jpeg","gif","png","mp4"]
if url.startswith("https://gfycat"):
soup = bs(requests.get(url).content, "html.parser")
tag = soup.findAll("source", {"type": "video/mp4"})[1]
url = tag.attrs['src']
elif url.startswith("https://v.redd.it/"):
# convert to video using vreddit and get link
url = "https://vreddit.cc/"+url.split('/')[-1]
source = requests.get(url).content # load link to fetch file
url = url+".mp4"
elif url.startswith("https://imgur.com"):
soup = bs(requests.get(url).content, "html.parser")
tags = soup.findAll("meta", {"property": "og:video"})
if not tags: # if image
tags = soup.findAll("meta", {"name": "twitter:image"})
url = tags[0].attrs["content"]
elif file_extension == "gifv": url = url[:-4]+"mp4"
elif file_extension not in valid_extension_list: return
# if not caught, it's already a valid extension :)
return url
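A couple of made-up example calls show what the function returns: a direct image link passes through unchanged, an imgur .gifv is rewritten to .mp4, and anything it does not recognise comes back as None.
print(find_file_link("https://i.redd.it/example.jpg"))     # "https://i.redd.it/example.jpg" (already a direct link)
print(find_file_link("https://i.imgur.com/example.gifv"))  # "https://i.imgur.com/example.mp4" (gifv rewritten to mp4)
print(find_file_link("https://example.com/page.html"))     # None (unsupported extension)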
Check if a file is a video
def file_is_video(file_name):
valid_video_extension_list = ["mp4"]
return file_name.split('.')[-1] in valid_video_extension_list
Check if a file has audio
def file_has_audio(file_name, url):
if file_is_video(file_name):
video = VideoFileClip(url)
if video is not None and video.audio is not None: return True
return False
Check if a file exists in folder hierarchy
def file_exists(File):
for root, dirs, files in os.walk('./'):
if File in files: return True
return False
Open a url link
def open_url(url):
    try:
        # stream the response and set a timeout so the Timeout handler below can actually trigger
        file = requests.get(url, stream=True, timeout=10)
        file.raise_for_status()
        return file
    except requests.exceptions.HTTPError as errh:
        print("HTTP Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("Unexpected Error:", err)
    return
Save a file locally in chunks
def save_file(save_dir, file_name, file):
with open(save_dir+file_name, 'wb') as fd:
for chunk in file.iter_content(chunk_size=128):
fd.write(chunk)