blob: 0cf21ce03f3f422da7dcb149705c9ebf2fc2c128 [file] [log] [blame]
#
# sources.py
#
# Convert source code comments to multi-line blocks (library file).
#
# Copyright 2002-2015 by
# David Turner.
#
# This file is part of the FreeType project, and may only be used,
# modified, and distributed under the terms of the FreeType project
# license, LICENSE.TXT. By continuing to use, modify, or distribute
# this file you indicate that you have read the license and
# understand and accept it fully.
#
# This library file contains definitions of classes needed to decompose C
# source code files into a series of multi-line `blocks'. There are two
# kinds of blocks.
#
# - Normal blocks, which contain source code or ordinary comments.
#
# - Documentation blocks, which have restricted formatting, and whose text
# always start with a documentation markup tag like `<Function>',
# `<Type>', etc.
#
# The routines to process the content of documentation blocks are contained
# in file `content.py'; the classes and methods found here only deal with
# text parsing and basic documentation block extraction.
#
import fileinput, re, sys, os, string
################################################################
##
## SOURCE BLOCK FORMAT CLASS
##
## A simple class containing compiled regular expressions to detect
## potential documentation format block comments within C source code.
##
## The `column' pattern must contain a group to `unbox' the content of
## documentation comment blocks.
##
## Later on, paragraphs are converted to long lines, which simplifies the
## regular expressions that act upon the text.
##
class SourceBlockFormat:
def __init__( self, id, start, column, end ):
"""Create a block pattern, used to recognize special documentation
blocks."""
self.id = id
self.start = re.compile( start, re.VERBOSE )
self.column = re.compile( column, re.VERBOSE )
self.end = re.compile( end, re.VERBOSE )
#
# Format 1 documentation comment blocks.
#
# /************************************/ (at least 2 asterisks)
# /* */
# /* */
# /* */
# /************************************/ (at least 2 asterisks)
#
start = r'''
\s* # any number of whitespace
/\*{2,}/ # followed by '/' and at least two asterisks then '/'
\s*$ # probably followed by whitespace
'''
column = r'''
\s* # any number of whitespace
/\*{1} # followed by '/' and precisely one asterisk
([^*].*) # followed by anything (group 1)
\*{1}/ # followed by one asterisk and a '/'
\s*$ # probably followed by whitespace
'''
re_source_block_format1 = SourceBlockFormat( 1, start, column, start )
#
# Format 2 documentation comment blocks.
#
# /************************************ (at least 2 asterisks)
# *
# * (1 asterisk)
# *
# */ (1 or more asterisks)
#
start = r'''
\s* # any number of whitespace
/\*{2,} # followed by '/' and at least two asterisks
\s*$ # probably followed by whitespace
'''
column = r'''
\s* # any number of whitespace
\*{1}(?![*/]) # followed by precisely one asterisk not followed by `/'
(.*) # then anything (group1)
'''
end = r'''
\s* # any number of whitespace
\*+/ # followed by at least one asterisk, then '/'
'''
re_source_block_format2 = SourceBlockFormat( 2, start, column, end )
#
# The list of supported documentation block formats. We could add new ones
# quite easily.
#
re_source_block_formats = [re_source_block_format1, re_source_block_format2]
#
# The following regular expressions correspond to markup tags within the
# documentation comment blocks. They are equivalent despite their different
# syntax.
#
# A markup tag consists of letters or character `-', to be found in group 1.
#
# Notice that a markup tag _must_ begin a new paragraph.
#
re_markup_tag1 = re.compile( r'''\s*<((?:\w|-)*)>''' ) # <xxxx> format
re_markup_tag2 = re.compile( r'''\s*@((?:\w|-)*):''' ) # @xxxx: format
#
# The list of supported markup tags. We could add new ones quite easily.
#
re_markup_tags = [re_markup_tag1, re_markup_tag2]
#
# A regular expression to detect a cross reference, after markup tags have
# been stripped off. Group 1 is the reference, group 2 the rest of the
# line.
#
# A cross reference consists of letters, digits, or characters `-' and `_'.
#
re_crossref = re.compile( r'@((?:\w|-)*)(.*)' ) # @foo
#
# Two regular expressions to detect italic and bold markup, respectively.
# Group 1 is the markup, group 2 the rest of the line.
#
# Note that the markup is limited to words consisting of letters, digits,
# the character `_', or an apostrophe (but not as the first character).
#
re_italic = re.compile( r"_(\w(?:\w|')*)_(.*)" ) # _italic_
re_bold = re.compile( r"\*(\w(?:\w|')*)\*(.*)" ) # *bold*
#
# This regular expression code to identify an URL has been taken from
#
# http://mail.python.org/pipermail/tutor/2002-September/017228.html
#
# (with slight modifications).
#
urls = r'(?:https?|telnet|gopher|file|wais|ftp)'
ltrs = r'\w'
gunk = r'/#~:.?+=&%@!\-'
punc = r'.:?\-'
any = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs,
'gunk' : gunk,
'punc' : punc }
url = r"""
(
\b # start at word boundary
%(urls)s : # need resource and a colon
[%(any)s] +? # followed by one or more of any valid
# character, but be conservative and
# take only what you need to...
(?= # [look-ahead non-consumptive assertion]
[%(punc)s]* # either 0 or more punctuation
(?: # [non-grouping parentheses]
[^%(any)s] | $ # followed by a non-url char
# or end of the string
)
)
)
""" % {'urls' : urls,
'any' : any,
'punc' : punc }
re_url = re.compile( url, re.VERBOSE | re.MULTILINE )
#
# A regular expression that stops collection of comments for the current
# block.
#
re_source_sep = re.compile( r'\s*/\*\s*\*/' ) # /* */
#
# A regular expression to find possible C identifiers while outputting
# source code verbatim, covering things like `*foo' or `(bar'. Group 1 is
# the prefix, group 2 the identifier -- since we scan lines from left to
# right, sequentially splitting the source code into prefix and identifier
# is fully sufficient for our purposes.
#
re_source_crossref = re.compile( r'(\W*)(\w*)' )
#
# A regular expression that matches a list of reserved C source keywords.
#
re_source_keywords = re.compile( '''\\b ( typedef |
struct |
enum |
union |
const |
char |
int |
short |
long |
void |
signed |
unsigned |
\#include |
\#define |
\#undef |
\#if |
\#ifdef |
\#ifndef |
\#else |
\#endif ) \\b''', re.VERBOSE )
################################################################
##
## SOURCE BLOCK CLASS
##
## There are two important fields in a `SourceBlock' object.
##
## self.lines
## A list of text lines for the corresponding block.
##
## self.content
## For documentation comment blocks only, this is the block content
## that has been `unboxed' from its decoration. This is `None' for all
## other blocks (i.e., sources or ordinary comments with no starting
## markup tag)
##
class SourceBlock:
def __init__( self, processor, filename, lineno, lines ):
self.processor = processor
self.filename = filename
self.lineno = lineno
self.lines = lines[:]
self.format = processor.format
self.content = []
if self.format == None:
return
words = []
# extract comment lines
lines = []
for line0 in self.lines:
m = self.format.column.match( line0 )
if m:
lines.append( m.group( 1 ) )
# now, look for a markup tag
for l in lines:
l = string.strip( l )
if len( l ) > 0:
for tag in re_markup_tags:
if tag.match( l ):
self.content = lines
return
def location( self ):
return "(" + self.filename + ":" + repr( self.lineno ) + ")"
# debugging only -- not used in normal operations
def dump( self ):
if self.content:
print "{{{content start---"
for l in self.content:
print l
print "---content end}}}"
return
fmt = ""
if self.format:
fmt = repr( self.format.id ) + " "
for line in self.lines:
print line
################################################################
##
## SOURCE PROCESSOR CLASS
##
## The `SourceProcessor' is in charge of reading a C source file and
## decomposing it into a series of different `SourceBlock' objects.
##
## A SourceBlock object consists of the following data.
##
## - A documentation comment block using one of the layouts above. Its
## exact format will be discussed later.
##
## - Normal sources lines, including comments.
##
##
class SourceProcessor:
def __init__( self ):
"""Initialize a source processor."""
self.blocks = []
self.filename = None
self.format = None
self.lines = []
def reset( self ):
"""Reset a block processor and clean up all its blocks."""
self.blocks = []
self.format = None
def parse_file( self, filename ):
"""Parse a C source file and add its blocks to the processor's
list."""
self.reset()
self.filename = filename
fileinput.close()
self.format = None
self.lineno = 0
self.lines = []
for line in fileinput.input( filename ):
# strip trailing newlines, important on Windows machines!
if line[-1] == '\012':
line = line[0:-1]
if self.format == None:
self.process_normal_line( line )
else:
if self.format.end.match( line ):
# A normal block end. Add it to `lines' and create a
# new block
self.lines.append( line )
self.add_block_lines()
elif self.format.column.match( line ):
# A normal column line. Add it to `lines'.
self.lines.append( line )
else:
# An unexpected block end. Create a new block, but
# don't process the line.
self.add_block_lines()
# we need to process the line again
self.process_normal_line( line )
# record the last lines
self.add_block_lines()
def process_normal_line( self, line ):
"""Process a normal line and check whether it is the start of a new
block."""
for f in re_source_block_formats:
if f.start.match( line ):
self.add_block_lines()
self.format = f
self.lineno = fileinput.filelineno()
self.lines.append( line )
def add_block_lines( self ):
"""Add the current accumulated lines and create a new block."""
if self.lines != []:
block = SourceBlock( self,
self.filename,
self.lineno,
self.lines )
self.blocks.append( block )
self.format = None
self.lines = []
# debugging only, not used in normal operations
def dump( self ):
"""Print all blocks in a processor."""
for b in self.blocks:
b.dump()
# eof