# crawlsite.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Crawl a site and extract all unique URLs for html pages.

This script takes one argument: the url to the site to crawl.
If you want to store the output, redirect it to a file.

Usage example (output to console):
python crawlsite.py http://www.mysite.com

Usage example (output to file in Windows):
python crawlsite.py http://www.mysite.com > mylinks.txt

This script was written in haste. Please report errors to
pete@standards-schmandards.com

This script uses the htmldata library by Connelly Barnes. Please
make sure it is available in the same folder.

"""

__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/01'

import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime


#Setup some basic parameters
#User-agent strings. Only useragentSelf is sent by the request code in this
#file; the two browser strings are kept as ready-made alternatives.
useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
#NOTE(review): unlike the other two values this one embeds the "User-Agent: "
#header name - strip that prefix before handing it to putheader().
useragentIE6 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__

#Url schemes that are never requested
skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
#Content types whose pages are scanned for further links
validContentTypes = ("text/html", "application/xhtml+xml")


#get command line parameters
#Starting url. Without it, exit with a usage message rather than an
#IndexError traceback. The message goes to stderr so that a redirected
#stdout (the link list) stays clean.
if len(sys.argv) < 2:
    sys.stderr.write("Usage: python crawlsite.py <url-of-site-to-crawl>\n")
    sys.exit(2)
url = sys.argv[1]

#Get root url (scheme + host); links outside it are not crawled
urlparts = urlparse.urlsplit(url)
rootUrl = urlparts[0] + "://" + urlparts[1]

#List of parsed urls (stored without fragment)
parsedurls = []

#Is contenttype parsable?
def isParsable(contentType):
    """Return True if contentType names one of validContentTypes.

    contentType is a Content-Type header value, so it may carry parameters
    (e.g. "text/html; charset=utf-8"); that is why a substring match is
    used. Media type names are case-insensitive, so both sides are compared
    in lower case and "Text/HTML" is accepted as well.
    """
    normalized = contentType.lower()
    for validContentType in validContentTypes:
        if validContentType.lower() in normalized:
            return True

    return False


def stripFragment(url):
    """Return url with its "#fragment" part removed.

    Scheme, host, path and query string are kept, so two links that differ
    only in their fragment map to the same string.
    """
    urlparts = urlparse.urlsplit(url)

    protocol = urlparts[0]
    server = urlparts[1]
    path = urlparts[2]
    query = urlparts[3]

    #Rebuild with an empty fragment. urlunsplit puts the "?" separator back
    #in front of a non-empty query; plain concatenation lost it and turned
    #"/a?x=1" into "/ax=1".
    return urlparse.urlunsplit((protocol, server, path, query, ""))




def addUrlToHistory(url):
    """Record url in parsedurls so it is not crawled again.

    The url is stored without its fragment, which is the same form the
    history is later searched with.
    """
    #append() mutates the module-level list in place, so no global
    #statement is needed; the url is split once, inside stripFragment.
    parsedurls.append(stripFragment(url))



#Check if URL exists. Returns status and content type.
def urlIsOk(url, maxRedirects=5):
    """Probe url with a HEAD request and report whether it is usable.

    Parameters:
        url          -- absolute url to check.
        maxRedirects -- number of redirects that may still be followed;
                        bounds the recursion when a server redirects in a
                        loop.

    Returns a tuple (ok, contentType, status):
        (True,  "unknown", 0)       scheme is in skippedProtocols, no request made
        (False, "unknown", 0)       no host, a different site, or a same-page
                                    link to a page already in parsedurls
        (True,  contentType, 200)   the server answered 200
        (False, contentType, code)  any other final status code
        (False, "unknown", 999)     the check itself raised an exception
    """
    try:

        #split the url to get the request item
        urlparts = urlparse.urlsplit(url)

        protocol = urlparts[0]
        server = urlparts[1]
        path = urlparts[2]
        query = urlparts[3]
        fragment = urlparts[4]

        #Skip links where protocol is one of skippedProtocols
        if protocol in skippedProtocols:
            return (True, "unknown", 0)

        #Without a host there is nothing to connect to
        if len(server) == 0:
            return (False, "unknown", 0)

        #Skip links to other sites. Scheme and host are compared as a whole:
        #a substring search would also accept foreign urls that merely
        #contain the root url somewhere, e.g. in their query string.
        if (protocol + "://" + server).lower() != rootUrl.lower():
            return (False, "unknown", 0)

        #Skip same page links
        if len(fragment) > 0 and stripFragment(url) in parsedurls:
            return (False, "unknown", 0)

        #Request target: the path (never empty) plus the query string, so
        #"/page?id=2" is not checked as "/page"
        target = path or "/"
        if query:
            target = target + "?" + query

        #Check url header. No explicit port is passed, so httplib takes a
        #"host:port" suffix from server or falls back to the scheme default.
        if protocol == "https":
            httpObj = httplib.HTTPSConnection(server)
        else:
            httpObj = httplib.HTTPConnection(server)

        try:
            httpObj.connect()
            httpObj.putrequest('HEAD', target)
            httpObj.putheader('Accept', '*/*')
            httpObj.putheader('User-Agent', useragentSelf)
            httpObj.endheaders()
            response = httpObj.getresponse()
            status = response.status
            contentType = response.getheader("content-type", "unknown")
            location = response.getheader("location")
        finally:
            #Close the socket on failures as well
            httpObj.close()

        if status in (301, 302, 303, 307):
            #Redirect without a target, or too many hops: report the status
            if not location or maxRedirects <= 0:
                return (False, contentType, status)
            #Location may be relative; resolve it against the current url
            return urlIsOk(urlparse.urljoin(url, location), maxRedirects - 1)

        if status != 200:
            #server error message
            return (False, contentType, status)

        #Server reports url is OK.
        return (True, contentType, 200)

    except Exception:
        #Best effort: any failure while probing just marks the url unusable
        return (False, "unknown", 999)



def checkUrl(url):
    """Return True when url should be crawled.

    A url qualifies when urlIsOk reports it as usable and the reported
    content type is accepted by isParsable; otherwise False is returned.
    """
    status = urlIsOk(url)
    usable = status[0]
    contentType = status[1]

    #Guard clause: an unusable url is never handed to isParsable
    if not usable:
        return False

    #determine if link is crawlable
    if not isParsable(contentType):
        return False
    return True




#get html for a page
def getContent(url):
    """Download url and return the response body.

    Returns "" when the page cannot be fetched, so callers can treat a
    failed download like an empty page.
    """
    #Exception instead of a bare except, so that Ctrl-C (KeyboardInterrupt)
    #and sys.exit() still stop the crawl.
    try:
        handle = urllib2.urlopen(url)
        try:
            return handle.read()
        finally:
            #Release the connection even if read() fails part-way
            handle.close()
    except Exception:
        return ""



#Get data
def printlinks(url, currentlevel):
    """Crawl from url and print the url of every crawlable page found.

    A page is printed once: urls whose fragment-less form is already in
    parsedurls are skipped, and so are urls rejected by checkUrl. Links
    extracted from each printed page are followed in turn.

    Parameters:
        url          -- url to start from.
        currentlevel -- kept for compatibility with existing callers; it is
                        not used and no depth limit is applied.
    """
    global currentUrl

    #Explicit stack instead of recursion: a long chain of linked pages
    #would otherwise exceed the interpreter's recursion limit.
    pending = [url]

    while pending:
        current = pending.pop()

        #Check if URL already parsed
        if stripFragment(current) in parsedurls:
            continue

        #check if url is ok
        if not checkUrl(current):
            continue

        #Get doc
        currentUrl = current
        contents = getContent(current)

        #add url to list
        addUrlToHistory(current)

        #print url (single argument, so this prints the same in Python 2)
        print(current)

        #Queue the page's links. They are pushed in reverse so that they are
        #popped - and therefore visited - in document order, depth first,
        #exactly as the recursive version did.
        links = htmldata.urlextract(contents, current)
        pending.extend([u.url for u in links][::-1])




#start script
#Crawl only when run as a script, so importing this module does not start
#a crawl as a side effect.
if __name__ == "__main__":
    printlinks(url, 0)