Creating an Online Repository of Searchable Scratch Forum Data (Process + Results)

Challenge


Currently the Scratch Forums does not have a functional search bar that permits users to filter through the metadata associated with the different threads.

"Search 'Results' Appearing as a Blank Table"
Users may be interested in such a feature in order to find historical tutorials/helpful posts on certain subjects, examine the contributions of other Scratchers, and find inspiration for their own projects. This project aims to provide a means for individuals to accomplish this task easily and effectively.

Process

The data from the forums is imported as raw HTML and processed via the beautifulsoup library.  Once the relevant information is extracted, it is placed into a sqlite3 database. This was published to a heroku web applet via the datasette JavaScript Library.

Results


The final product is located at this web address. The metadata is separated into tables representing all thirty-six public forums. You can search each of the columns with a variety of boolean operators and download processed data in a JSON or CSV file format.

Future Steps 

  • Updating Script - figure out a way to dynamically update the web page automatically on a regular basis. 
  • No manual extraction of data - currently the script requires that you supply the number of pages for a given forum. This process could be automated within the program in the future. 

Code - Python


# coding=utf-8

"""
Definition: Scratch Discussion Forum Database Generator
"""

from bs4 import BeautifulSoup

import requests

import sqlite3

def extractData(numberOfPages,forumID,conn,c,forum):
    
    for j in range(1,numberOfPages+1):
        
        while True:  
            r  = requests.get("https://scratch.mit.edu/discuss/" + str(forumID) + "/?page=" + str(j))
            if (r.status_code == 200):
                break
            
        data = r.text
            
        soup = BeautifulSoup(data, 'lxml')             
        
        soup = soup.find(id="pagewrapper")
    
        soup = soup.find(id = "content")
    
        soup = soup.find(id = "vf")

        soup = soup.div
         
        soup = soup.find("div", {"class": "box-content"})
           
        soup = soup.contents[1]
    
        soup = soup.contents[3]
    
        while (soup.contents.count(u'\n')): 
            soup.contents.remove(u'\n')
    
        for i in range(0,len(soup.contents)):
            newSoup = soup.contents[i]
            if (newSoup.find("div", {"class" : "isticky"}) == None): # Q: Is the topic stickied?
                stickied = "No"
            else:
                stickied = "Yes"
            if (newSoup.find("div", {"class": "iclosed"}) == None): # Q: Is the topic open or closed?
                closed = "No"
            else:
                closed = "Yes"
            latestTimeStamp = newSoup.find("td", {"class": "tcr"}).a.string #Latest Post Timestamp
            if ("Today" in latestTimeStamp):
                latestTimeStamp = latestTimeStamp.replace("Today","Dec. 23, 2019") #replace with today's date
            if ("Yesterday" in latestTimeStamp):
                latestTimeStamp = latestTimeStamp.replace("Yesterday","Dec. 22, 2019") #replace with yesterday's date
            topicTitle = newSoup.find("h3", {"class": "topic_isread"}).string #Title of the Topic 
            topicAuthor = newSoup.find("span", {"class": "byuser"}).string[3:] #Author of the Topic
            topicLink = "https://scratch.mit.edu" + newSoup.find("h3", {"class": "topic_isread"}).a['href'] # Link to the topic
            numberOfReplies = newSoup.find("td", {"class": "tc2"}).string # Number of Replies
            numberOfViews = newSoup.find("td", {"class": "tc3"}).string # Number of Views
            latestPostLink = "https://scratch.mit.edu" + newSoup.find("td", {"class": "tcr"}).a['href'] #Latest Post Link
            latestPostAuthor = newSoup.find("td", {"class": "tcr"}).find("span", {"class":"byuser"}).string[3:] #Latest Post Author
            #insert = "(" + "'" + stickied + "'" + "," + "'" + closed + "'" + "," + "'" + topicTitle + "'" + "," + "'" + topicAuthor + "'" + "," + "'" + topicLink + "'" + "," + numberOfReplies + "," + numberOfViews + "," + "'" + latestPostLink + "'" + "," + "'" + latestTimeStamp + "'" + "," + "'" + latestPostAuthor + "'" + ")" 
            c.execute("INSERT INTO " + forum + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",(stickied, closed, topicTitle, topicAuthor, topicLink, numberOfReplies, numberOfViews, latestPostLink, latestTimeStamp, latestPostAuthor))            
            
        conn.commit()
        
        print(j)

def main(): 
    
    conn = sqlite3.connect('scratchForumPosts.db')
    
    c = conn.cursor()
    
    c.execute('''CREATE TABLE announcements(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 
    
    # FORMAT: (Number of Pages, Forum ID, SQLite Database Var., SQLite Database Cursor Var., Forum Database Storage Table Name)
    
    extractData(11,5,conn,c,"announcements") # Announcements Forum
    
    c.execute('''CREATE TABLE newScratchers(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(316,6,conn,c,"newScratchers") # New Scratchers Forum
    
    c.execute('''CREATE TABLE helpWithScripts(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(1913,7,conn,c,"helpWithScripts") # Help With Scripts Forum

    c.execute('''CREATE TABLE showAndTell(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(2458,8,conn,c,"showAndTell") # Show and Tell Forum
    
    c.execute('''CREATE TABLE projectIdeas(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(325,9,conn,c,"projectIdeas") # Project Ideas Forum
    
    c.execute('''CREATE TABLE collaboration(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(612,10,conn,c,"collaboration") # Collaboration Forum
    
    c.execute('''CREATE TABLE requests(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(571,11,conn,c,"requests") # Requests Forum
    
    c.execute('''CREATE TABLE questionsAboutScratch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(1199,4,conn,c,"questionsAboutScratch") # Questions About Scratch Forum
    
    c.execute('''CREATE TABLE suggestions(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(943,1,conn,c,"suggestions") # Suggestions Forum
    
    c.execute('''CREATE TABLE bugsAndGlitches(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''')

    extractData(1023,3,conn,c,"bugsAndGlitches") # Bug and Glitches Forum
    
    c.execute('''CREATE TABLE advancedTopics(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(185,31,conn,c,"advancedTopics") # Advanced Topics Forum
    
    c.execute('''CREATE TABLE connectingToThePhysicalWorld(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(28,32,conn,c,"connectingToThePhysicalWorld") # Connecting to the Physical World Forum
    
    c.execute('''CREATE TABLE developingScratchExtensions(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''')  

    extractData(16,48,conn,c,"developingScratchExtensions") # Developing Scratch Extensions Forum
    
    c.execute('''CREATE TABLE openSourceProjects(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(7,49,conn,c,"openSourceProjects") # Open Source Projects Forum
    
    c.execute('''CREATE TABLE thingsIAmMakingAndCreating(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(284,29,conn,c,"thingsIAmMakingAndCreating") # Things I'm Making and Creating Forum
    
    c.execute('''CREATE TABLE thingsIAmReadingAndPlaying(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(385,30,conn,c,"thingsIAmReadingAndPlaying") # Things I'm Reading and Playing Forum
    
    c.execute('''CREATE TABLE africa(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(1,55,conn,c,"africa") # Africa Forum
    
    c.execute('''CREATE TABLE bahasaIndonesia(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(2,36,conn,c,"bahasaIndonesia") # Bahasa Indonesia Forum
    
    c.execute('''CREATE TABLE català(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''')  

    extractData(2,33,conn,c,"català") # Català Forum
    
    c.execute('''CREATE TABLE deutsch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(48,13,conn,c,"deutsch") # Deutsch Forum
    
    c.execute('''CREATE TABLE ελληνικά(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(7,26,conn,c,"ελληνικά") # Ελληνικά Forum
    
    c.execute('''CREATE TABLE español(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(45,14,conn,c,"español") # Español Forum
    
    c.execute('''CREATE TABLE français(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(152,15,conn,c,"français") # Français Forum
    
    c.execute('''CREATE TABLE עברית(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(3,22,conn,c,"עברית") # Hebrew Forum 
        
    c.execute('''CREATE TABLE 한국어(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(81,23,conn,c,"한국어") # 한국어 Forum
    
    c.execute('''CREATE TABLE italiano(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(35,21,conn,c,"italiano") # Italiano Forum
    
    c.execute('''CREATE TABLE nederlands(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(15,19,conn,c,"nederlands") # Nederlands Forum
    
    c.execute('''CREATE TABLE 日本語(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(117,18,conn,c,"日本語") # 日本語 Forum
    
    c.execute('''CREATE TABLE norsk(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(1,24,conn,c,"norsk") # Norsk Forum
    
    c.execute('''CREATE TABLE polski(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(56,17,conn,c,"polski") # Polski Forum
    
    c.execute('''CREATE TABLE português(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(19,20,conn,c,"português") # Português Forum
    
    c.execute('''CREATE TABLE Pусский(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(37,27,conn,c,"Pусский") # Pусский Forum
    
    c.execute('''CREATE TABLE türkçe(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(25,25,conn,c,"türkçe") # Türkçe Forum
    
    c.execute('''CREATE TABLE 中文(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(62,16,conn,c,"中文") # 中文 Forum
    
    c.execute('''CREATE TABLE otherLanguages(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(14,34,conn,c,"otherLanguages") # Other Languages Forum
    
    c.execute('''CREATE TABLE translatingScratch(Stickied text, Closed text, Title text, Author text, Link text, Replies real, Views real, LatestPostLink text, LatestPostTimestamp text, LatestPostAuthor text)''') 

    extractData(10,28,conn,c,"translatingScratch") # Translating Scratch Forum
    
    conn.close()
    
main()

Code - Command Line


datasette publish heroku scratchForumPosts.db -n scratchdiscussionforums

Syntax-highlighted text box generated with hilite.me.

No comments:

Post a Comment