Currently the Scratch Forums does not have a functional search bar that permits users to filter through the metadata associated with the different threads.
"Search 'Results' Appearing as a Blank Table" |
Process
The data from the forums is imported as raw HTML and processed via the beautifulsoup library. Once the relevant information is extracted, it is placed into a sqlite3 database. This was published to a heroku web applet via the datasette JavaScript Library.
Results
The final product is located at this web address. The metadata is separated into tables representing all thirty-six public forums. You can search each of the columns with a variety of boolean operators and download processed data in a JSON or CSV file format.
Future Steps
- Updating Script - figure out a way to dynamically update the web page automatically on a regular basis.
- No manual extraction of data - currently the script requires that you supply the number of pages for a given forum. This process could be automated within the program in the future.
Code - Python
# coding=utf-8 """ Definition: Scratch Discussion Forum Database Generator """ from bs4 import BeautifulSoup import requests import sqlite3 def extractData(numberOfPages,forumID,conn,c,forum): for j in range(1,numberOfPages+1): while True: r = requests.get("https://scratch.mit.edu/discuss/" + str(forumID) + "/?page=" + str(j)) if (r.status_code == 200): break data = r.text soup = BeautifulSoup(data, 'lxml') soup = soup.find(id="pagewrapper") soup = soup.find(id = "content") soup = soup.find(id = "vf") soup = soup.div soup = soup.find("div", {"class": "box-content"}) soup = soup.contents[1] soup = soup.contents[3] while (soup.contents.count(u'\n')): soup.contents.remove(u'\n') for i in range(0,len(soup.contents)): newSoup = soup.contents[i] if (newSoup.find("div", {"class" : "isticky"}) == None): # Q: Is the topic stickied? stickied = "No" else: stickied = "Yes" if (newSoup.find("div", {"class": "iclosed"}) == None): # Q: Is the topic open or closed? closed = "No" else: closed = "Yes" latestTimeStamp = newSoup.find("td", {"class": "tcr"}).a.string #Latest Post Timestamp if ("Today" in latestTimeStamp): latestTimeStamp = latestTimeStamp.replace("Today","Dec. 23, 2019") #replace with today's date if ("Yesterday" in latestTimeStamp): latestTimeStamp = latestTimeStamp.replace("Yesterday","Dec. 22, 2019") #replace with yesterday's date topicTitle = newSoup.find("h3", {"class": "topic_isread"}).string #Title of the Topic topicAuthor = newSoup.find("span", {"class": "byuser"}).string[3:] #Author of the Topic topicLink = "https://scratch.mit.edu" + newSoup.find("h3", {"class": "topic_isread"}).a['href'] # Link to the topic numberOfReplies = newSoup.find("td", {"class": "tc2"}).string # Number of Replies numberOfViews = newSoup.find("td", {"class": "tc3"}).string # Number of Views latestPostLink = "https://scratch.mit.edu" + newSoup.find("td", {"class": "tcr"}).a['href'] #Latest Post Link latestPostAuthor = newSoup.find("td", {"class": "tcr"}).find("span", {"class":"byuser"}).string[3:] #Latest Post Author #insert = "(" + "'" + stickied + "'" + "," + "'" + closed + "'" + "," + "'" + topicTitle + "'" + "," + "'" + topicAuthor + "'" + "," + "'" + topicLink + "'" + "," + numberOfReplies + "," + numberOfViews + "," + "'" + latestPostLink + "'" + "," + "'" + latestTimeStamp + "'" + "," + "'" + latestPostAuthor + "'" + ")" c.execute("INSERT INTO " + forum + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",(stickied, closed, topicTitle, topicAuthor, topicLink, numberOfReplies, numberOfViews, latestPostLink, latestTimeStamp, latestPostAuthor)) conn.commit() print(j) def main(): conn = sqlite3.connect('scratchForumPosts.db') c = conn.cursor() c.execute('''CREATE TABLE announcements(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') # FORMAT: (Number of Pages, Forum ID, SQLite Database Var., SQLite Database Cursor Var., Forum Database Storage Table Name) extractData(11,5,conn,c,"announcements") # Announcements Forum c.execute('''CREATE TABLE newScratchers(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(316,6,conn,c,"newScratchers") # New Scratchers Forum c.execute('''CREATE TABLE helpWithScripts(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(1913,7,conn,c,"helpWithScripts") # Help With Scripts Forum c.execute('''CREATE TABLE showAndTell(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(2458,8,conn,c,"showAndTell") # Show and Tell Forum c.execute('''CREATE TABLE projectIdeas(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(325,9,conn,c,"projectIdeas") # Project Ideas Forum c.execute('''CREATE TABLE collaboration(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(612,10,conn,c,"collaboration") # Collaboration Forum c.execute('''CREATE TABLE requests(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(571,11,conn,c,"requests") # Requests Forum c.execute('''CREATE TABLE questionsAboutScratch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(1199,4,conn,c,"questionsAboutScratch") # Questions About Scratch Forum c.execute('''CREATE TABLE suggestions(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(943,1,conn,c,"suggestions") # Suggestions Forum c.execute('''CREATE TABLE bugsAndGlitches(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(1023,3,conn,c,"bugsAndGlitches") # Bug and Glitches Forum c.execute('''CREATE TABLE advancedTopics(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(185,31,conn,c,"advancedTopics") # Advanced Topics Forum c.execute('''CREATE TABLE connectingToThePhysicalWorld(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(28,32,conn,c,"connectingToThePhysicalWorld") # Connecting to the Physical World Forum c.execute('''CREATE TABLE developingScratchExtensions(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(16,48,conn,c,"developingScratchExtensions") # Developing Scratch Extensions Forum c.execute('''CREATE TABLE openSourceProjects(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(7,49,conn,c,"openSourceProjects") # Open Source Projects Forum c.execute('''CREATE TABLE thingsIAmMakingAndCreating(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(284,29,conn,c,"thingsIAmMakingAndCreating") # Things I'm Making and Creating Forum c.execute('''CREATE TABLE thingsIAmReadingAndPlaying(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(385,30,conn,c,"thingsIAmReadingAndPlaying") # Things I'm Reading and Playing Forum c.execute('''CREATE TABLE africa(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(1,55,conn,c,"africa") # Africa Forum c.execute('''CREATE TABLE bahasaIndonesia(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(2,36,conn,c,"bahasaIndonesia") # Bahasa Indonesia Forum c.execute('''CREATE TABLE català(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(2,33,conn,c,"català") # Català Forum c.execute('''CREATE TABLE deutsch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(48,13,conn,c,"deutsch") # Deutsch Forum c.execute('''CREATE TABLE ελληνικά(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(7,26,conn,c,"ελληνικά") # Ελληνικά Forum c.execute('''CREATE TABLE español(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(45,14,conn,c,"español") # Español Forum c.execute('''CREATE TABLE français(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(152,15,conn,c,"français") # Français Forum c.execute('''CREATE TABLE עברית(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(3,22,conn,c,"עברית") # Hebrew Forum c.execute('''CREATE TABLE 한국어(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(81,23,conn,c,"한국어") # 한국어 Forum c.execute('''CREATE TABLE italiano(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(35,21,conn,c,"italiano") # Italiano Forum c.execute('''CREATE TABLE nederlands(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(15,19,conn,c,"nederlands") # Nederlands Forum c.execute('''CREATE TABLE 日本語(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(117,18,conn,c,"日本語") # 日本語 Forum c.execute('''CREATE TABLE norsk(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(1,24,conn,c,"norsk") # Norsk Forum c.execute('''CREATE TABLE polski(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(56,17,conn,c,"polski") # Polski Forum c.execute('''CREATE TABLE português(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(19,20,conn,c,"português") # Português Forum c.execute('''CREATE TABLE Pусский(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(37,27,conn,c,"Pусский") # Pусский Forum c.execute('''CREATE TABLE türkçe(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(25,25,conn,c,"türkçe") # Türkçe Forum c.execute('''CREATE TABLE 中文(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(62,16,conn,c,"中文") # 中文 Forum c.execute('''CREATE TABLE otherLanguages(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(14,34,conn,c,"otherLanguages") # Other Languages Forum c.execute('''CREATE TABLE translatingScratch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') extractData(10,28,conn,c,"translatingScratch") # Translating Scratch Forum conn.close() main()
Code - Command Line
datasette publish heroku scratchForumPosts.db -n scratchdiscussionforums
Syntax-highlighted text box generated with hilite.me.
No comments:
Post a Comment