"""
Crawl a site and extract all unique URLs for html pages.
This script takes one argument: the url to the site to crawl.
If you want to store the output, pipe it to a file.
Usage example (output to console):
python crawlsite.py http://www.mysite.com
Usage example (output to file in Windows):
python crawlsite.py http://www.mysite.com > mylinks.txt
This script was written in haste. Please report errors to
pete@standards-schmandards.com
This script uses the htmldata library by Connelly Barnes. Please
make sure it is available in the same folder.
"""
# Script metadata; also used below to build the crawler's User-Agent string.
__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/01'
import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime
# Canned User-Agent strings; only useragentSelf is actually sent (see urlIsOk).
useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
# NOTE(review): this value already embeds a "User-Agent: " prefix, so it would
# produce a malformed header if ever passed to putheader('User-Agent', ...).
useragentIE6 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__
# URL schemes that are never followed by the crawler.
skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
# Content types whose bodies are parsed for further links.
validContentTypes = ("text/html", "application/xhtml+xml")
# Start URL from the command line.
# NOTE(review): no argument-count check -- running without an argument raises
# an IndexError traceback instead of printing usage.
url = sys.argv[1]
urlparts = urlparse.urlsplit(url)
# Scheme + host of the start URL; used by urlIsOk to keep the crawl on-site.
rootUrl = urlparts[0] + "://" + urlparts[1]
# Global history of fragment-stripped URLs that have already been crawled.
parsedurls = []
def isParsable(contentType):
    """Return True when *contentType* matches one of the crawlable
    (X)HTML content types listed in the module-level validContentTypes.

    Membership is tested with substring containment so values such as
    "text/html; charset=utf-8" are accepted.
    """
    return any(candidate in contentType for candidate in validContentTypes)
def stripFragment(url):
    """Return *url* with any #fragment part removed.

    Bug fix: the original rebuilt the URL from urlsplit() pieces as
    ``protocol + "://" + server + path + query`` -- the query string was
    appended without its leading '?', corrupting every URL that carried
    a query (e.g. "http://a/b?q=1#f" became "http://a/bq=1"). Splitting
    on the first '#' keeps the URL byte-for-byte up to the fragment
    marker, which is the behavior the rest of the script relies on for
    de-duplicating fragment-only link variations.
    """
    # Everything before the first '#' is the fragment-free URL; URLs
    # without a fragment are returned unchanged.
    return url.split("#", 1)[0]
def addUrlToHistory(url):
    """Record *url* (minus any #fragment) in the global crawl history.

    The history (parsedurls) is what printlinks/urlIsOk consult to avoid
    visiting the same page twice. The original body unpacked urlsplit()
    into five local variables that were never used; those dead locals
    have been removed.
    """
    global parsedurls
    parsedurls.append(stripFragment(url))
def urlIsOk(url):
    """HEAD-check *url* and decide whether the crawler may visit it.

    Returns a 3-tuple (ok, contentType, status):
      ok          -- True when the URL is acceptable for crawling
      contentType -- the response Content-Type header, or "unknown"
                     when no request was made
      status      -- the HTTP status code; 0 when no request was made,
                     999 when any exception occurred
    """
    global rootUrl
    global parsedurls
    try:
        urlparts = urlparse.urlsplit(url)
        protocol = urlparts[0]
        server = urlparts[1]
        path = urlparts[2]
        fragment = urlparts[4]
        # Skipped schemes (javascript:, mailto:, ...) are reported as "ok"
        # with an "unknown" content type; checkUrl() then rejects them
        # anyway because "unknown" is not a parsable content type.
        if protocol in skippedProtocols:
            return (True, "unknown", 0)
        # Absolute URLs that do not contain the start site's root are
        # off-site and rejected without a request.
        if len(server) > 0:
            if url.find(rootUrl) == -1:
                return (False, "unknown", 0)
        # A fragment-only variation of an already-crawled page is a duplicate.
        if len(fragment) > 0:
            if stripFragment(url) in parsedurls:
                return (False, "unknown", 0)
        # Issue a HEAD request so status and content type are learned
        # without downloading the body.
        # NOTE(review): port 80 is hard-coded, so https:// URLs will not
        # be checked correctly -- confirm whether the crawl targets are
        # plain HTTP only.
        httpObj = httplib.HTTPConnection(server, 80)
        httpObj.connect()
        httpObj.putrequest('HEAD', path)
        httpObj.putheader('Accept', '*/*')
        httpObj.putheader('User-Agent', useragentSelf)
        httpObj.endheaders()
        response = httpObj.getresponse()
        contentType = response.getheader("content-type", "unknown")
        httpObj.close();
        if response.status != 200:
            # Follow 301/302 redirects by recursing on the Location header.
            # NOTE(review): no redirect-depth limit -- a redirect loop would
            # recurse until the exception handler below trips.
            if response.status == 301:
                return urlIsOk(response.getheader("location"))
            if response.status == 302:
                return urlIsOk(response.getheader("location"))
            else:
                return (False, contentType, response.status)
        else:
            return (True, contentType, 200)
    except Exception:
        # Any failure (DNS, connect, parse, ...) marks the URL as not ok
        # with the sentinel status 999.
        return (False, "unknown", 999)
def checkUrl(url):
    """Return True when *url* is reachable, on-site, and serves a
    content type the crawler can parse for links."""
    global currentUrl
    ok, contentType, status = urlIsOk(url)
    # Acceptable only if the HEAD check passed AND the content type is
    # one of the parsable (X)HTML types.
    if not ok:
        return False
    return isParsable(contentType)
def getContent(url):
    """Fetch *url* and return the response body, or "" on any error.

    Failures are deliberately swallowed so one bad page does not abort
    the whole crawl. Bug fix: the original used a bare ``except:``,
    which also swallows KeyboardInterrupt and SystemExit, making the
    script hard to interrupt mid-download; narrowed to Exception.
    """
    try:
        return urllib2.urlopen(url).read()
    except Exception:
        return ""
def printlinks(url, currentlevel):
    """Recursively crawl *url*, printing each newly discovered on-site
    HTML URL to stdout (one per line).

    currentlevel is forwarded unchanged on every recursive call and is
    never tested -- presumably a leftover from an abandoned depth limit,
    so the recursion is unbounded in practice.
    """
    global recurselimit  # NOTE(review): never assigned anywhere in this file -- looks vestigial
    global pagetitle     # NOTE(review): never assigned anywhere in this file -- looks vestigial
    global parsedurls
    global currentUrl
    # Skip URLs whose fragment-stripped form was already crawled.
    if not (stripFragment(url) in parsedurls):
        if checkUrl(url):
            currentUrl = url
            contents = getContent(url)
            # Record the URL before recursing so cycles terminate.
            addUrlToHistory(url)
            print url
            # htmldata.urlextract resolves relative links against the page URL.
            links = htmldata.urlextract(contents, url)
            for u in links:
                printlinks(u.url, currentlevel)
# Kick off the crawl from the command-line start URL at level 0.
printlinks(url, 0)