# massvalidate.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Read list of links, call validator and store result in CSV file.

This script takes two arguments:
1. the name of a text file containing links (one link per line) to check.
2. the name of the output file where validation results are stored.

Usage example:
python massvalidate.py mylinks.txt resultfile.csv

This script was written in haste. Please report errors to
pete@standards-schmandards.com

"""

__author__ = 'Peter Krantz'
__version__ = '0.1'
__date__ = '2005/04/02'

import urllib, time
import xml.dom.minidom
from xml.dom.minidom import Node, parse, parseString
import sys
import string
import csv

# Get child node value
def GetNodeFirstChildValue(NodeCollection):
    """Return the data of the first child of the first node in a collection.

    NodeCollection is a DOM node list, such as the result of
    getElementsByTagName(). An empty string is returned when the collection
    is empty or when its first node has no children.
    """
    if NodeCollection.length == 0:
        return ""

    firstChild = NodeCollection[0].firstChild
    # None is a singleton, so test identity rather than equality.
    if firstChild is None:
        return ""

    return firstChild.data


#Get command line parameters...

#Both file names are required. Stop with a usage hint instead of an
#IndexError traceback when one of them is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: python massvalidate.py <linkfile> <resultfile.csv>")

#Filename containing urls to check
linkfilename = sys.argv[1]

#Filename of result file
resultfilename = sys.argv[2]

#Open the file to read list of links to check.
#open() is the recommended way to open a file; file() is meant for type tests.
linkFile = open(linkfilename, "r")

#CSV layout used for the result file
class MyDialect(csv.excel):
    """Excel-style CSV with ';' separators and every field quoted."""

    # Quote every field, not only those that need it.
    quoting = csv.QUOTE_ALL
    quotechar = '"'
    # A quote character inside a field is written as two quote characters.
    doublequote = True
    # Fields are separated by semicolons instead of commas.
    delimiter = ';'

#Open the result file.
#The Python 2 csv module wants the underlying file opened in binary mode;
#in text mode Windows turns the writer's "\r\n" line ends into "\r\r\n".
resultFile = csv.writer(open(resultfilename, "wb"), dialect=MyDialect)


#Url to validator check script. Replace the ip with the address of
#the machine where you installed your version of the validator.
validatorUrl = "http://127.0.0.1/validator/htdocs/check?uri="

# Current site identifier
siteid = 0


# Turn one validator response into the result columns for a link.
def _SummarizeValidatorXml(valresultXml):
    """Extract the result columns from one validator XML response.

    Returns the tuple (errorCount, server, hasDc, docType). When the response
    holds a non-empty <error> element the validator could not validate the
    page, and ("validatorerror", "", "", "") is returned instead.

    Any exception raised while parsing or reading the XML is passed on to
    the caller. The parsed document is always unlinked before returning.
    """
    DomDoc = parseString(valresultXml)
    try:
        root = DomDoc.documentElement

        #check if error was received (validator could not validate)
        validatorError = GetNodeFirstChildValue(root.getElementsByTagName("error"))
        print("validatorError: " + validatorError)

        # Test the text itself; wrapping it in str() fails on non-ASCII text.
        if validatorError:
            return ("validatorerror", "", "", "")

        #collect data from validator result xml

        #errorcount
        errorCount = root.getElementsByTagName("msg").length
        print("errorCount: " + str(errorCount))

        #server
        server = GetNodeFirstChildValue(root.getElementsByTagName("server"))
        print("server: " + server)

        #has dublin core?
        hasDc = GetNodeFirstChildValue(root.getElementsByTagName("hasdc"))
        print("hasDc: " + hasDc)

        #doctype
        docType = GetNodeFirstChildValue(root.getElementsByTagName("doctype"))
        print("docType: " + docType)

        return (errorCount, server, hasDc, docType)
    finally:
        #Close DomDoc on every path, including the early return and errors
        DomDoc.unlink()


# Call local validator for each item in file
for line in linkFile:
    link = line.strip()

    #call validator only if line is not blank
    if not link:
        continue

    #write where we are now
    print("Trying " + link)

    # Percent-encode the link so that characters such as '&', '?' or '#'
    # stay part of the uri parameter instead of altering the validator query.
    urlstring = validatorUrl + urllib.quote(link, safe="") + "&output=xml"

    try:
        f = urllib.urlopen(urlstring)
        try:
            valresultXml = f.read()
        finally:
            #Close html doc even when reading fails
            f.close()

        #parse xml, get result and write it to output file...
        resultFile.writerow((str(siteid), link) + _SummarizeValidatorXml(valresultXml))

    except Exception:
        # Deliberately broad: the validator could not be reached, or its
        # answer could not be parsed as xml or written out. Record the link
        # as failed and carry on with the remaining links.
        resultFile.writerow((str(siteid), link, "error", "", "", ""))

        print("Could not check.")

    siteid += 1

# Close input file
linkFile.close()


print("Done!")