colorize.py

# -*- coding: utf-8 -*-
"""
    Colorize - Python source formatter that outputs python code in XHTML.
    This script is based on MoinMoin - The Python Source Parser.

    Usage:
    colorize.py [source file name] [optional author name]
"""

# Imports
import cgi
import string
import sys
import cStringIO
import keyword
import token
import tokenize
import re
import os

#Filepath of source file from command line parameter.  
sourcefile = sys.argv[1]

#Get file name of source file.  
filename = os.path.split(sourcefile)[1]

#Get optional author name parameter (it is added to the DC metadata)
if len(sys.argv)> 2:
    authorname = sys.argv[2]
else:
    authorname = "Unknown"

#Set up basic values.  
_KEYWORD = token.NT_OFFSET + 1
_TEXT    = token.NT_OFFSET + 2

_classes = {
    token.NUMBER:       'token_number',
    token.OP:           'token_op',
    token.STRING:       'token_string',
    tokenize.COMMENT:   'token_comment',
    token.NAME:         'token_name',
    token.ERRORTOKEN:   'token_error',
    _KEYWORD:           'keyword',
    _TEXT:              'text',
}

_DEFAULTENCODING = "utf-8"

#Define start of XHtml document.  
docstart = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
    <head>"""


#Add css from file to output document.  
docstart += """\n\t<style type="text/css">\n"""
cssfile = open("colorize.css", "r")
docstart += cssfile.read()
docstart += """\n\t</style>\n"""

#Close Xhtml document.  
docend = "\n\t</body>\n</html>"

#Set default encoding for output document.  
docencoding = _DEFAULTENCODING


def getEncodingOfFile(sourcefile):
    """Get encoding of source file. If no encoding found, returns _DEFAULTENCODING."""
    myfile = file(sourcefile)
    line = myfile.readline()

    #try line 1
    encoding = parseEncoding(line)

    #encoding found?
    if encoding != "":
        return encoding

    #if not - try line two
    line = myfile.readline()
    encoding = parseEncoding(line)

    #encoding found?
    if encoding != "":
        return encoding

    #if not - try BOM
    if myfile.encoding != None:
        return myfile.encoding

    #if not - return default encoding
    return _DEFAULTENCODING



def parseEncoding(textline):
    """Parse encoding from textline."""
    #Check line for encoding match
    regex = re.compile("coding[=:]\s*([-\w.]+)")
    match = regex.search(textline, 1)

    if match != None:
        #Return found encoding
        return match.group(1)
    else:
        #return default
        return ""



class Parser:
    """ Send colored python source.
    """

    def __init__(self, raw, out = sys.stdout):
        """ Store the source text.
        """
        self.raw = string.strip(string.expandtabs(raw))
        self.out = out

    def format(self, formatter, form):
        """ Parse and send the colored source. Med öäå.
        """
        # store line offsets in self.lines
        self.lines = [0, 0]
        pos = 0
        while 1:
            pos = string.find(self.raw, '\n', pos) + 1
            if not pos: break
            self.lines.append(pos)
        self.lines.append(len(self.raw))

        # parse the source and write it
        self.out.write(docstart)

        # write metadata
        self.out.write("\n<title>" + filename + "</title>\n")
        self.out.write('<link rel="schema.DC" href="http://purl.org/DC/elements/1.0" />\n')
        self.out.write('<meta name="DC.Language" content="en" />\n')
        self.out.write('<meta name="DC.Format" content="text/html" />\n')
        self.out.write('<meta name="DC.Type" content="Software" />\n')
        self.out.write('<meta name="DC.Title" content="Python source of %s" />\n' % filename)
        self.out.write('<meta name="DC.Creator" content="%s" />\n' % authorname)

        #encoding
        self.out.write('<meta http-equiv="Content-Type" content="text/html; charset=%s" />\n' % docencoding)

        #Close head and begin body.  
        self.out.write("\n\t</head>\n<body>")

        self.pos = 0
        text = cStringIO.StringIO(self.raw)
        self.out.write('<pre><code>')
        try:
            tokenize.tokenize(text.readline, self)
        except tokenize.TokenError, ex:
            msg = ex[0]
            line = ex[1][0]
            self.out.write("<h3>ERROR: %s</h3>%s\n" % (
                msg, self.raw[self.lines[line]:]))
        self.out.write('</code></pre>\n')

        self.out.write(docend)

    def __call__(self, toktype, toktext, (srow,scol), (erow,ecol), line):
        """ Token handler.
        """
        if 0:
            print "type", toktype, token.tok_name[toktype], "text", toktext,
            print "start", srow, scol, "end", erow, ecol, "<br />"

        # calculate new positions
        oldpos = self.pos
        newpos = self.lines[srow] + scol
        self.pos = newpos + len(toktext)

        # handle newlines
        if toktype in [token.NEWLINE, tokenize.NL]:
            self.out.write('\n')
            return

        # send the original whitespace, if needed
        if newpos > oldpos:
            self.out.write(self.raw[oldpos:newpos])

        # skip indenting tokens
        if toktype in [token.INDENT, token.DEDENT]:
            self.pos = newpos
            return

        # map token type to a color/class group
        if token.LPAR <= toktype and toktype <= token.OP:
            toktype = token.OP
        elif toktype == token.NAME and keyword.iskeyword(toktext):
            toktype = _KEYWORD
        classval = _classes.get(toktype, _classes[_TEXT])

        style = ''
        if toktype == token.ERRORTOKEN:
            style = ' style="border: solid 1.5pt #FF0000;"'

        # send text
        self.out.write('<span class="%s"%s>' % (classval, style))
        self.out.write(cgi.escape(toktext))
        self.out.write('</span>')


if __name__ == "__main__":
    import os, sys
    print "Formatting " + sourcefile

    #Set up encoding
    docencoding = getEncodingOfFile(sourcefile)

    # open own source
    source = open(sourcefile).read()

    # write colorized version to "[filename].py.html"
    Parser(source, open(sourcefile + '.html', 'wt')).format(None, None)

    # done!
    print "Done! Wrote result file " + sourcefile + ".html"