This Python code generates syntax-highlighted Python code in HTML format. I know about SilverCity, but I want this for my Site5 account, where I cannot install executable code. The code below was highlighted using the code itself: spooky.
It is a simplistic solution, but it should not be confused by multiline strings, comment characters in strings, etc. I started off by trying to use the PLY Python lex module as a tokeniser and processing the tokens, but that persisted in confusing multiline string characters with normal strings, and while thinking about it I realised that I could live without it. I don't know how slow this is: if you use it on a website with heavy traffic, you will want to cache the output.
#
# Syntax Highlighting
#
import re

try:
    # Python 3.2+
    from html import escape as _html_escape
except ImportError:
    # Python 2
    from cgi import escape as _html_escape


def _EscapeHtml(strText):
    """
    Escape '&', '<' and '>' in strText for use as HTML text content.

    Quotes are deliberately left alone (second argument False) so the
    output is the same whichever escape implementation was imported.
    """
    return _html_escape(strText, False)


# Optional string-literal prefix (r, u, b and two-letter combinations).
_strPrefix = r'[rRuUbB]{0,2}'

#
# Token table: (name, compiled regular expression, inline CSS style or None).
# The rules are tried in order at the current input position and the first
# one that matches wins, so the order matters: strings come before
# IDENTIFIER so a prefix such as r or u is not taken for a name, and
# KEYWORD comes before IDENTIFIER.
#
strStyles = (
    ('PUNC', re.compile(
        r'<<|>>|<=|>=|!=|==|[-+*|^~/%=<>\[\]{}(),.:]'),
        None),
    # The decimal point is escaped so that "1,2" is two numbers, not one.
    ('NUMBER', re.compile(
        r'0[xX][0-9a-fA-F]+[lL]?|\d+(?:\.\d+)?(?:[eE][+-]?\d+)?[jJlL]?'),
        'color: red'),
    # \b stops keywords matching the start of longer names ("in" in "index").
    ('KEYWORD', re.compile(
        r'\b(?:def|class|break|continue|del|exec|finally|pass|' +
        r'print|raise|return|try|except|global|assert|lambda|' +
        r'yield|for|while|if|elif|else|and|in|is|not|or|import|' +
        r'from|with|as|True|False|None)\b'),
        'font-weight: bold'),
    # A whole triple-quoted string, up to its closing delimiter or, when it
    # is unterminated, the end of the input. Matching it as one token means
    # quotes or '#' inside the string cannot swallow the closing delimiter.
    ('MULTILINE', re.compile(
        _strPrefix +
        r"(?:'''(?:\\.|.)*?(?:'''|\Z)" +
        r'|"""(?:\\.|.)*?(?:"""|\Z))',
        re.DOTALL),
        'color: darkred'),
    # Single-line string; backslash escapes are consumed in pairs so that
    # '\\' ends where it should.
    ('STRING', re.compile(
        _strPrefix +
        r"(?:'(?:\\[\s\S]|[^'\\\r\n])*'" +
        r'|"(?:\\[\s\S]|[^"\\\r\n])*")'),
        'color: red'),
    ('IDENTIFIER', re.compile(
        r'[a-zA-Z_][a-zA-Z0-9_]*'),
        None),
    # Runs to the end of the line; no trailing newline is required, so a
    # comment on the last line of the input is still recognised.
    ('COMMENT', re.compile(
        r'\#[^\r\n]*'),
        'color: green; font-style: italic'),
    ('WHITESPACE', re.compile(
        r'[ \t\r\n]+'),
        None),
    # if all else fails... (DOTALL so that any single character matches)
    ('UNKNOWN', re.compile(
        r'.', re.DOTALL),
        None)
)


class Highlight:
    """
    Syntax highlight some python code.

    strOutput holds the fragments of HTML generated so far and
    strSpanStyle the style of the currently open <span> (None when no
    span is open).
    """

    def __init__(self):
        self.strOutput = []
        self.strSpanStyle = None

    def Highlight(self, strData):
        """
        Syntax highlight some python code.

        strData is the python source text. Returns the html version of
        the code: the HTML-escaped text with <span style="..."> elements
        around the tokens that have a style in strStyles.
        """
        #
        # Start from a clean slate so that one instance can be used for
        # several inputs without earlier output leaking into the result.
        #
        self.strOutput = []
        self.strSpanStyle = None

        i = 0
        nLength = len(strData)

        #
        # While input is not exhausted...
        #
        while i < nLength:
            #
            # Compare current position with all possible display types.
            #
            for _strTok, oRE, strStyle in strStyles:
                oMatch = oRE.match(strData, i)
                # Ignore empty matches: they would never advance i.
                if oMatch and oMatch.end() > i:
                    self.ChangeStyle(strStyle)
                    self.strOutput.append(_EscapeHtml(oMatch.group()))
                    i = oMatch.end()
                    break
            else:
                #
                # Token not found so dump out the (escaped) character. The
                # UNKNOWN rule normally makes this unreachable; it only
                # guards against a table without a catch-all rule.
                #
                self.ChangeStyle(None)
                self.strOutput.append(_EscapeHtml(strData[i]))
                i += 1

        #
        # Terminate any styles in use.
        #
        self.ChangeStyle(None)
        return "".join(self.strOutput)

    def ChangeStyle(self, strStyle):
        """
        Generate output to change from existing style to another style only.

        strStyle is the inline CSS for the new style, or None for
        unstyled text.
        """
        #
        # Output minimal formatting code: only output anything if the style
        # has actually changed.
        #
        if self.strSpanStyle == strStyle:
            return
        if self.strSpanStyle is not None:
            self.strOutput.append('</span>')
        if strStyle is not None:
            self.strOutput.append('<span style="%s">' % strStyle)
        self.strSpanStyle = strStyle
Used like this:
import sys

# Read this script's own source so that the highlighter is demonstrated on
# itself; the with-block closes the file as soon as it has been read.
with open(sys.argv[0]) as oFile:
    data = oFile.read()

strHighlighted = Highlight().Highlight(data)

# A parenthesised single argument prints the same under the Python 2 print
# statement and the Python 3 print function.
print("""<html>
<head>
<title>It works</title>
</head>
<body>
<pre>
%s
</pre>
</body>
</html>
""" % strHighlighted)


New and improved:
Peter