# I found this script a while back. I edited it and have been playing with it
# for a few months. It no longer works with the new site version, but with
# some clever foo it will run again.
#
# A lot is commented out: paste.frubar.net went down about a week after I
# started scraping.



#!/usr/bin/python

##############################################################
# author: Thomas Dziedzic
# rebuilt by : Sw1tCh : to include more sites and fix shit
# description: downloads all recent text on pastebin.com /
# pastie.frubar.net / pastebin.ca
##############################################################



from HTMLParser import HTMLParser
import urllib, string, os, sys


##############################################################
########## PASTEBIN.COM
###############################################################


print "Beginning Scraping of data from Pastebin.com now..."

create_dir_base = "mkdir /root/Working/Pastebin_Scraping/saved"
os.popen(create_dir_base)

url = "http://www.pastebin.com/"
urldl = "http://pastebin.com/pastebin.php?dl="
path = "pb&j"

# retrieves html content
f = urllib.urlopen(url)
s = f.read()
f.close()

# parse html
# parse html: collect the first 8 anchor hrefs that appear after the
# "Recent Posts" heading.  The first 20 characters of each href (the
# "http://pastebin.com/" prefix) are stripped so only the paste id is kept.
links = []
class Parser(HTMLParser):
    # set to True once the "Recent Posts" heading text has been seen
    encounteredRecentPosts = False
    # number of recent-post links collected so far (capped at 8)
    linkcount = 0

    def handle_starttag(self, tag, attrs):
        # Only anchors after the heading, and only the first 8 of them.
        if tag == "a" and self.encounteredRecentPosts and self.linkcount < 8:
            # Look the href up by attribute name instead of assuming it
            # is the first attribute: attrs[0][1] raised IndexError on
            # attribute-less <a> tags and grabbed the wrong value when
            # the site reordered attributes.
            href = dict(attrs).get("href")
            if href is not None:
                links.append(href[20:])
                self.linkcount += 1

    def handle_data(self, data):
        if data == "Recent Posts":
            self.encounteredRecentPosts = True

parser = Parser()
parser.feed(s)
parser.close()

# download pastebin files into folder
for link in links:
if len(link) != 8:
#print "Ignored Link [If this goes over 2, then there might be a problem]"
#print "Failed Link: " + link
# if the links are all being ignored, the length count may need to be changed to a higher value
print ""
else:
f = urllib.urlopen("http://pastebin.com/download.php?i=" + link)
print "Dumping Files " + link
s = f.read()
f.close()
link = link.replace('/','')
link = "pastebin.com_" + link
f = open( '/root/Working/Pastebin_Scraping/saved/' + link, 'w')
f.write(s)
f.close()




##############################################################
########## PASTEBIN.ca
##############################################################


#print "Beginning Scraping of data from Pastebin.ca now..."
#
#url = "http://pastebin.ca/"
#urldl = "http://pastebin.ca/raw"
#path = "pb&j"

# retrieves html content
#f = urllib.urlopen(url)
#s = f.read()
#f.close()

# parse html
#links = []
#class Parser(HTMLParser):
# encounteredRecentPosts = False
# linkcount = 0
#
# def handle_starttag(self, tag, attrs):
# if tag == "a" and self.encounteredRecentPosts and self.linkcount < 10:
# links.append(attrs[0][1])
# self.linkcount += 1

# def handle_data(self, data):
# if data == "Recent Posts":
# self.encounteredRecentPosts = True



#parser = Parser()
#parser.feed(s)
#parser.close()

# download pastebin files into folder
#for link in links:
# if len(link) != 8:
# print "Ignored Link [If this goes over 2, then there might be a problem]"
# print "Failed Link: " + link
# if the links are all being ignored, the length count may need to be changed to a higher value
# print "Problem - " + link
# else:
# f = urllib.urlopen("http://pastebin.ca/raw" + link)
# print "Dumping Files " + link
# s = f.read()
# f.close()
# link = link.replace('/','')
# link = "pastebin.ca_" + link
# f = open( '/root/Working/Pastebin_Scraping/saved/' + link, 'w')
# f.write(s)
# f.close()


##############################################################
########## PASTE. FRUBAR.NET
##############################################################



#print "Beginning Scraping of data from paste.frubar.net now..."

#url = "http://paste.frubar.net/"
#urldl = "http://paste.frubar.net/Download"
#path = "pb&j"

## retrieves html content
#f = urllib.urlopen(url)
#s = f.read()
#f.close()

# parse html
#links = []
#class Parser(HTMLParser):
# encounteredRecentPosts = False
# linkcount = 0

# def handle_starttag(self, tag, attrs):
# if tag == "a" and self.encounteredRecentPosts and self.linkcount < 10:
# links.append(attrs[0][1])
# self.linkcount += 1

# def handle_data(self, data):
# if data == "Recent Posts":
# self.encounteredRecentPosts = True



#parser = Parser()
#parser.feed(s)
#parser.close()

# download pastebin files into folder
#for link in links:
## link = link.replace('http://paste.frubar.net/','')
# if len(link) != 5:
# print "Ignored Link [If this goes over 2, then there might be a problem]"
# print "Failed Link: " + link
# # if the links are all being ignored, the length count may need to be changed to a higher value
# print "Problem - " + link
# else:
# f = urllib.urlopen("http://paste.frubar.net/Download" + link)
# print "Dumping Files " + link
# s = f.read()
# f.close()
# link = "paste.frubar.net_" + link
# f = open( '/root/Working/Pastebin_Scraping/saved/' + link, 'w')
# f.write(s)
# f.close()
# end of script (trailing web-page residue removed)