Peter's Blog

Redefining the Impossible

Syntax Highlighting python in python


This Python code generates syntax-highlighted Python code in HTML format. I know about SilverCity, but I want this for my Site5 account, where I cannot install executable code. The code below was highlighted using the code itself: spooky.

It is a simplistic solution, but it should not be confused by multiline strings, comment characters inside strings, and so on. I started off by trying to use the PLY (Python Lex-Yacc) lexer as a tokeniser and processing its tokens, but that persisted in confusing multiline string delimiters with normal strings, and while thinking about it I realised that I could live without it. I don't know how slow this is: if you use it on a website with heavy traffic, you will want to cache the output.

#
# Syntax Highlighting
#

import re
import cgi

# Regular expression rules for simple tokens
#
# Token table: each entry is (token name, compiled pattern, CSS style or None).
# Entries are tried in order at the current input position and the first
# pattern that matches wins, so more specific tokens must come first.
#
strStyles = (
    ('PUNC', re.compile( r'<<|>>|<=|>=|!=|==|[-+*|^~/%=<>\[\]{}(),.:]'), None),
    # The decimal point is escaped so that "1x2" is not read as one number.
    ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                            'color: red'),
    # The trailing lookahead stops a keyword matching the start of a longer
    # identifier ("define" must not be highlighted as "def" + "ine").
    ('KEYWORD', re.compile( r'(?:def|class|break|continue|del|exec|finally|pass|' +
                            r'print|raise|return|try|except|global|assert|lambda|' +
                            r'yield|for|while|if|elif|else|and|in|is|not|or|import|' +
                            r'from|True|False)(?![a-zA-Z0-9_])'), 'font-weight: bold'),
    # Group 1 is the bare delimiter (''' or """) without any r/u prefix.
    ('MULTILINE', re.compile( r'r?u?(\'\'\'|""")'), 'color: darkred'),
    ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), 'color: red'),
    ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
    # Runs to the end of the line without requiring a newline, so a comment
    # on the last line of a file with no trailing newline is still recognised.
    ('COMMENT', re.compile( r'\#[^\r\n]*'), 'color: green; font-style: italic'),
    ('WHITESPACE', re.compile( r'[ \t\r\n]+'), None),

# if all else fails...
    ('UNKNOWN', re.compile( r'.'), None)
)

class Highlight:
    """
    Syntax highlight some python code.

    The input is tokenised with the module-level strStyles table and each run
    of identically styled tokens is wrapped in an inline-styled <span>.
    """

    #
    # cgi.escape() was removed in Python 3.8. Use html.escape() where it
    # exists and fall back to cgi.escape() on interpreters that lack it.
    # Both are always called with quote=False so quotes are left untouched.
    #
    try:
        from html import escape as _Escape
    except ImportError:
        from cgi import escape as _Escape
    _Escape = staticmethod( _Escape)

    def __init__( self):
        #
        # strOutput collects the output fragments, strSpanStyle is the CSS
        # style of the <span> currently open (None when no span is open).
        #
        self.strOutput = []
        self.strSpanStyle = None

    def Highlight( self, strData):
        """
        Syntax highlight some python code.

        strData is the source text to highlight.
        Returns html version of code: everything appended to self.strOutput
        so far, joined into one string. The output list is not cleared, so
        calling this twice on one instance returns both results concatenated.
        """

        i = 0
        #
        # Delimiter (''' or """) of the multiline string currently open,
        # or '' when not inside one.
        #
        strMultiline = ''

        #
        # While input is not exhausted...
        #
        while i < len( strData):
            #
            # Compare current position with all possible display types.
            #
            for strTok, oRE, strStyle in strStyles:
                oMatch = oRE.match( strData, i)
                if oMatch:
                    strValue = self._Escape( oMatch.group(), quote=False)

                    #
                    # Inside a multiline string every token is output as raw
                    # text in the string's style, so the style only changes
                    # when no multiline string is open.
                    #
                    if strMultiline == '':
                        self.ChangeStyle( strStyle)
                    self.strOutput.append( strValue)

                    if strTok == 'MULTILINE':
                        if strMultiline == '':
                            #
                            # Start of a multiline string: remember how it was
                            # started (""" vs ''').
                            #
                            strMultiline = oMatch.group(1)
                        elif oMatch.group(1) == strMultiline:
                            #
                            # Same delimiter as the opener, so the string ends
                            # here. Only the delimiter is compared, so a stray
                            # r/u prefix cannot hide the terminator.
                            #
                            strMultiline = ''

                    i += len( oMatch.group())
                    break
            else:
                #
                # Token not found so dump out the (escaped) character.
                #
                self.ChangeStyle( None)
                self.strOutput.append( self._Escape( strData[i], quote=False))
                i += 1

        #
        # Terminate any styles in use.
        #
        self.ChangeStyle( None)

        return "".join( self.strOutput)

    def ChangeStyle( self, strStyle):
        """
        Generate output to change from existing style to another style only.

        strStyle is the CSS style text for the new span, or None for no span.
        """

        #
        # Output minimal formatting code: only output anything if the style has
        # actually changed.
        #
        if self.strSpanStyle == strStyle:
            return
        if self.strSpanStyle is not None:
            self.strOutput.append( '</span>')
        if strStyle is not None:
            self.strOutput.append( '<span style="%s">' % strStyle)
        self.strSpanStyle = strStyle

Used like this:

import sys

#
# Read this script's own source, making sure the file is closed again
# even if the read fails.
#
oSource = open( sys.argv[0])
try:
    data = oSource.read()
finally:
    oSource.close()
strHighlighted = Highlight().Highlight( data)

#
# A single parenthesised argument prints the same text whether print is a
# statement or a function. The page is closed with </html>; the original
# template closed it with a second </head>.
#
print( """<html>

<head>
<title>It works</title>
</head>
<body>
<pre>
%s
</pre>
</body>

</html>
""" % strHighlighted)

Filed under: hosting php python site5

Peter Says:

New and improved:

  • more optimised output
  • C, C++, PHP and HTML
  • PHP embedded in HTML and vice versa
#
# Syntax Highlighting
#

import re
import cgi

class Highlight:
    """
    Do syntax highlighting.

    The highlighter is a small state machine: oStyles maps each mode name to
    a callback and an ordered table of (token name, compiled pattern, style).
    The style is CSS text, None for "no span", or 'Keep' for "leave the
    current span as it is". A callback outputs the token and may return the
    name of the mode to switch to (None means stay in the current mode).
    """

    #
    # cgi.escape() was removed in Python 3.8. Use html.escape() where it
    # exists and fall back to cgi.escape() on interpreters that lack it.
    # Both are always called with quote=False so quotes are left untouched.
    #
    try:
        from html import escape as _Escape
    except ImportError:
        from cgi import escape as _Escape
    _Escape = staticmethod( _Escape)

    def __init__( self, strMode):
        """
        Initialise highlighter: strMode = language (Python, C, CPP, PHP, HTML)
        """
        self.strOutput = []
        self.strSpanStyle = None
        #
        # Delimiter of the multiline string / heredoc currently open.
        #
        self.strMultilineString = ''

        #
        # C++ uses the C token table as it stands; plain C uses the same
        # table with the C++-only keywords suppressed.
        #
        if strMode == 'CPP':
            strMode = 'C'
            self.strSuppressTokens = []
        elif strMode == 'C':
            self.strSuppressTokens = ['CPPKEYWORD']
        else:
            self.strSuppressTokens = []

        self.strMode = strMode

    def PythonHighlightToken( self, strTok, oMatch, strStyle):
        """
        Callback for python specific highlighting.

        Returns the new mode name, or None to stay in the current mode.
        """
        strValue = self._Escape( oMatch.group(), quote=False)

        if strTok == 'MULTILINESTRING':
            #
            # Start of a multiline string.
            #
            self.ChangeStyle( strStyle)
            self.strOutput.append( strValue)
            #
            # Remember you are in a string and remember how it was
            # started (""" vs ''')
            #
            self.strMultilineString = oMatch.group(1)
            return 'PythonMultilineString'

        elif strTok == 'ENDMULTILINESTRING':
            #
            # Text up to and including a triple quote, found within a
            # multiline string.
            #
            if oMatch.group(1) == self.strMultilineString:
                #
                # Same delimiter as the opener so the string stops here.
                #
                self.strOutput.append( strValue)
                self.strMultilineString = ''
                return 'Python'
            #
            # The other kind of triple quote: fall through and output it
            # as part of the string.
            #

        self.ChangeStyle( strStyle)
        self.strOutput.append( strValue)
        return None

    def CHighlightToken( self, strTok, oMatch, strStyle):
        """
        Callback for C specific highlighting.

        C needs no mode changes: switch style and output the token.
        """
        strValue = self._Escape( oMatch.group(), quote=False)

        self.ChangeStyle( strStyle)
        self.strOutput.append( strValue)
        return None

    def PHPHighlightToken( self, strTok, oMatch, strStyle):
        """
        Callback for PHP specific highlighting.

        Returns the new mode name, or None to stay in the current mode.
        """
        strValue = self._Escape( oMatch.group(), quote=False)

        if strTok == 'MULTILINESTRING':
            #
            # Start of a heredoc.
            #
            self.ChangeStyle( strStyle)
            self.strOutput.append( strValue)
            #
            # Remember you are in a heredoc and remember the identifier
            # that has to terminate it.
            #
            self.strMultilineString = oMatch.group(1)
            return 'PHPMultilineString'

        elif strTok == 'ENDMULTILINESTRING':
            #
            # Text up to an identifier at the start of a line, found within
            # a heredoc.
            #
            if oMatch.group(1) == self.strMultilineString:
                #
                # Identifier is the terminator so the heredoc stops here.
                #
                self.strOutput.append( strValue)
                self.strMultilineString = ''
                return 'PHP'

        self.ChangeStyle( strStyle)
        self.strOutput.append( strValue)

        if strTok == 'GOTOHTML':
            #
            # Embedded HTML
            #
            return 'HTML'
        else:
            return None

    def HTMLHighlightToken( self, strTok, oMatch, strStyle):
        """
        Callback for HTML specific highlighting.

        Returns the new mode name, or None to stay in the current mode.
        """
        strValue = self._Escape( oMatch.group(), quote=False)
        self.ChangeStyle( strStyle)
        self.strOutput.append( strValue)

        if strTok == 'TAG':
            #
            # Change to mode 1, 'within tag'.
            #
            return 'HTMLTag'

        elif strTok == 'ENDTAG':
            #
            # Tag closed: back to mode 0, 'looking for tags'.
            #
            return 'HTML'

        elif strTok == 'GOTOPHP':
            #
            # Embedded PHP
            #
            return 'PHP'

        else:
            #
            # No state change.
            #
            return None

    #
    # Mode name -> (callback, token table). The callbacks are stored as plain
    # functions, so Highlight() passes self explicitly when calling them.
    # Tokens are tried in table order and the first match wins.
    #
    oStyles = {
        'Python': ( PythonHighlightToken,
            (
                ('PUNC', re.compile( r'[-+*!|&^~/%\=<>\[\]{}(),.:]'), 'font-weight: bold'),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                                        'color: red'),
                ('KEYWORD', re.compile( r'(def|class|break|continue|del|exec|finally|pass|' +
                                        r'print|raise|return|try|except|global|assert|lambda|' +
                                        r'yield|for|while|if|elif|else|and|in|is|not|or|import|' +
                                        r'from|True|False)(?![a-zA-Z0-9_])'), 'color: blue; font-weight: bold'),
                # Lookahead as for KEYWORD, so "selfish" stays one identifier.
                ('MAGIC', re.compile( r'(?:self|None)(?![a-zA-Z0-9_])'), 'color: blue'),
                ('MULTILINESTRING', re.compile( r'r?u?(\'\'\'|""")'), 'color: darkred'),
                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), 'color: red'),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                # No newline required, so a comment at end of input still
                # matches; the newline itself is WHITESPACE ('Keep').
                ('COMMENT', re.compile( r'\#[^\r\n]*'), 'color: green; font-style: italic'),
                ('WHITESPACE', re.compile( r'[ \t\r\n]+'), 'Keep'),
            # if all else fails...
                ('UNKNOWN', re.compile( r'.'), None)
            )),

        'PythonMultilineString': ( PythonHighlightToken,
            (
                ('ENDMULTILINESTRING', re.compile( r'.*?("""|\'\'\')', re.DOTALL), 'color: darkred'),
                ('UNKNOWN', re.compile( r'.'), 'Keep')
            )),

        'C': ( CHighlightToken,
            (
                ('COMMENT', re.compile( r'//[^\r\n]*'), 'color: green; font-style: italic'),
                ('MULTILINECOMMENT', re.compile( r'/\*.*?\*/', re.DOTALL), 'color: green; font-style: italic'),
                ('PREPROCESSOR', re.compile( r'\s*#.*?[^\\]\s*\n', re.DOTALL), 'color: magenta; font-style: italic'),
                ('PUNC', re.compile( r'[-+*!&|^~/%\=<>\[\]{}(),.:]'), 'font-weight: bold'),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                                        'color: red'),
                ('KEYWORD', re.compile( r'(sizeof|int|long|short|char|void|' +
                                        r'signed|unsigned|float|double|' +
                                        r'goto|break|return|continue|asm|' +
                                        r'case|default|if|else|switch|while|for|do|' +
                                        r'struct|union|enum|typedef|' +
                                        r'static|register|auto|volatile|extern|const)(?![a-zA-Z0-9_])'), 'color: blue; font-weight: bold'),
                ( 'CPPKEYWORD', re.compile( r'(class|private|protected|public|template|new|delete|' +
                                            r'this|friend|using|inline|export|bool|throw|try|catch|' +
                                            r'operator|typeid|virtual)(?![a-zA-Z0-9_])'), 'color: blue; font-weight: bold'),
                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), 'color: red'),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                ('WHITESPACE', re.compile( r'[ \t\r\n]+'), 'Keep'),
                ('UNKNOWN', re.compile( r'.'), None)
            )),

        'PHP': ( PHPHighlightToken,
            (
                ('COMMENT', re.compile( r'//[^\r\n]*'), 'color: green; font-style: italic'),
                ('MULTILINECOMMENT', re.compile( r'/\*.*?\*/', re.DOTALL), 'color: green; font-style: italic'),
                ('MULTILINESTRING', re.compile( r'<<<\s*([a-zA-Z0-9_]+)'), 'color: darkred'),
                ('GOTOPHP', re.compile( r'<\?php'), 'color: red'),
                ('PUNC', re.compile( r'[-+*!&|^~/%\=<>\[\]{}(),.:]'), 'font-weight: bold'),
                ('NUMBER', re.compile( r'0x[0-9a-fA-F]+|[+-]?\d+(\.\d+)?([eE][+-]\d+)?|\d+'),
                                        'color: red'),
                ('KEYWORD', re.compile( r'(declare|else|enddeclare|endswitch|elseif|endif|if|switch|' +
                                        r'as|do|endfor|endforeach|endwhile|for|foreach|while|' +
                                        r'case|default|switch|function|return|break|continue|exit|' +
                                        r'var|const|boolean|bool|integer|int|real|double|float|string|' +
                                        r'array|object|NULL|extends|implements|instanceof|parent|self|' +
                                        r'include|require|include_once|require_once|new|true|false)(?![a-zA-Z0-9_])'), 'color: blue; font-weight: bold'),

                ('STRING', re.compile( r'r?u?\'(.*?)(?<!\\)\'|"(.*?)(?<!\\)"'), 'color: red'),
                ('VARIABLE', re.compile( r'\$[a-zA-Z_][a-zA-Z0-9_]*'), 'color:blue'),
                ('IDENTIFIER', re.compile( r'[a-zA-Z_][a-zA-Z0-9_]*'), None),
                ('WHITESPACE', re.compile( r'[ \t\r\n]+'), 'Keep'),
                ('GOTOHTML', re.compile( r'\?>'), 'color: red'),
                ('UNKNOWN', re.compile( r'.'), None)
            )),

        'PHPMultilineString': ( PHPHighlightToken,
            (
                ('ENDMULTILINESTRING', re.compile( r'.*?\n([a-zA-Z0-9_]+)', re.DOTALL), 'color: darkred'),
                # NOTE: this pattern can match the empty string; Highlight()
                # ignores zero-length matches so it cannot stall the scan.
                ('UNKNOWN', re.compile( r'.*?(?!\n)'), 'Keep')
            )),

        'HTML': ( HTMLHighlightToken,
            # Mode 0: just look for tags
            (
                ('COMMENT', re.compile( r'<!--[^>]*-->|<!>'), 'color: green; font-style: italic'),
                ('XMLCRAP', re.compile( r'<![^>]*>'), 'color: blue; font-style: italic'),
                ('SCRIPT', re.compile( r'<script .*?</script>', re.IGNORECASE + re.DOTALL), 'color: black'),
                ('TAG', re.compile( r'</?\s*[a-zA-Z0-9]+'), 'color: darkred; font-weight: bold'),
                ('GOTOPHP', re.compile( r'<\?php'), 'color: red'),
                # NOTE: matches the empty string at a '<' that no pattern
                # above accepted; Highlight() ignores zero-length matches.
                ('UNKNOWN', re.compile( r'[^<]*'), None)
            )),
            # Mode 1: within tags,
        'HTMLTag': ( HTMLHighlightToken,
            (
                ('ENDTAG', re.compile( r'>'), 'color: darkred; font-weight: bold'),
                ('ATTRIBUTE', re.compile( r'[a-zA-Z][a-zA-Z0-9:]*='), "color: green; font-weight: bold"),
                ('VALUE', re.compile( r'"[^"]*"'), "color: red"),
                ('WHITESPACE', re.compile( r'[\s]+'), None),
                ('UNKNOWN', re.compile( r'.'), None)
            ))
    }

    def Highlight( self, strData):
        """
        Syntax highlight some code in the language chosen at construction.

        strData is the source text to highlight.
        Returns html version of code (tabs expanded to 4 spaces): everything
        appended to self.strOutput so far, joined into one string. The output
        list is not cleared, so calling this twice on one instance returns
        both results concatenated.
        Raises KeyError if the current mode has no entry in oStyles.
        """
        i = 0

        strMode = self.strMode

        #
        # While input is not exhausted...
        #
        while i < len( strData):
            fnCallback, oTokens = self.oStyles[strMode]

            #
            # Compare current position with all possible display types.
            #
            for strTok, oRE, strStyle in oTokens:
                if strTok in self.strSuppressTokens:
                    continue

                oMatch = oRE.match( strData, i)
                #
                # A zero-length match consumes nothing. Accepting it would
                # leave i unchanged and loop forever, so treat it as no match.
                #
                if not oMatch or oMatch.end() == i:
                    continue

                strNewMode = fnCallback( self, strTok, oMatch, strStyle)
                if strNewMode is not None:
                    strMode = strNewMode

                i = oMatch.end()
                break
            else:
                #
                # Token not found so dump out one character. It is escaped
                # because it may be a '<' or '&' that no pattern accepted.
                #
                self.ChangeStyle( None)
                self.strOutput.append( self._Escape( strData[i], quote=False))
                i += 1

        #
        # Terminate any styles in use.
        #
        self.ChangeStyle( None)

        #
        # Expand tabs to 4 spaces.
        # Doesn't matter if this number is wrong, the indentation will be butt ugly anyhow.
        #
        return "".join( self.strOutput).expandtabs(4)

    def ChangeStyle( self, strStyle):
        """
        Generate output to change from existing style to another style only.

        strStyle is CSS style text, None for no span, or 'Keep' to leave
        whatever span is currently open untouched.
        """
        #
        # Output minimal formatting code: only output anything if the style has
        # actually changed.
        #
        if strStyle == 'Keep' or strStyle == self.strSpanStyle:
            return
        if self.strSpanStyle is not None:
            self.strOutput.append( '</span>')
        if strStyle is not None:
            self.strOutput.append( '<span style="%s">' % strStyle)
        self.strSpanStyle = strStyle

Peter

Have Your Say

I welcome constructive comments or questions but I reserve the right to delete any comments that displease me.

Who are you?

(Optional) If you enter an email address here I might email you back. Your email address will not be sold to spammers or shown anywhere.

What do you have to say?