blob: 8d5e5b4a2ffd31d6d425bcf36d2d93060042c720 [file] [log] [blame] [edit]
#!/usr/bin/python
# Copyright (C) 2018 Google Inc. All Rights Reserved.
"""Extracts licenses from a list of source files.
Outputs a NOTICES files to stdout and a status log to stderr.
"""
from __future__ import print_function
import argparse
import bisect
import logging
import os
import re
import sys
import textwrap
# Marks the line where a copyright statement begins.
_COPYRIGHT_PATTERN = r'Copyright '
_COPYRIGHT_RE = re.compile(_COPYRIGHT_PATTERN)
# A character that carries meaning in notice text; used to find the leftmost
# "real" column when rigidly dedenting a comment block.
_MEANINGFUL_LETTER_PATTERN = r'[a-zA-Z()]'
_MEANINGFUL_LETTER_RE = re.compile(_MEANINGFUL_LETTER_PATTERN)
# RCS/CVS keyword tags such as "$Id: ... $".
_RCS_TAG_PATTERN = r'\$.*\$'
_RCS_TAG_RE = re.compile(_RCS_TAG_PATTERN)
# SCCS "what" strings ("@(#)...") through end of line.
_JUNK_PATTERN = r'@\(#\).*$'
_JUNK_RE = re.compile(_JUNK_PATTERN)
# Explicit end-of-copyright marker found in some files.
_JUNK2_PATTERN = r'###COPYRIGHTEND####'
_JUNK2_RE = re.compile(_JUNK2_PATTERN)
# Revision tags from the citrus project.
_JUNK3_PATTERN = r'citrus Id:.*Exp'
_JUNK3_RE = re.compile(_JUNK3_PATTERN)
# Dangling "from:" attribution lines.
_JUNK4_PATTERN = r'\s*from:\s*$'
_JUNK4_RE = re.compile(_JUNK4_PATTERN)
# BSD sccs include marker.
_JUNK5_PATTERN = r'%sccs.include.redist.c%'
_JUNK5_RE = re.compile(_JUNK5_PATTERN)
# The maximum number of columns to fill in the generated output.
_OUTPUT_COLUMNS = 79
# The maximum number of rows from the top of a source file to search for a
# Copyright statement. Beyond that, it is more likely that the Copyright
# statement refers to a small piece of embedded code than the whole file.
_MAXIMUM_COPYRIGHT_ROWS = 200
def _remove_junk(line):
  """Strips RCS tags and other known junk markers from a notice line."""
  # Order matters only in that it mirrors the historical scrub sequence.
  scrubbers = (_RCS_TAG_RE, _JUNK_RE, _JUNK2_RE, _JUNK3_RE, _JUNK4_RE,
               _JUNK5_RE)
  cleaned = line
  for scrubber in scrubbers:
    cleaned = scrubber.sub('', cleaned)
  return cleaned
def _read_lines(filename):
with open(filename, 'r') as f:
return [x.rstrip(' \t\r\n') for x in f.readlines()]
def _read_file_list(filename):
  """Reads a file of paths (one per line), resolved relative to that file."""
  base_directory = os.path.dirname(filename)
  return [os.path.abspath(os.path.join(base_directory, entry))
          for entry in _read_lines(filename)]
def _expand_directory(path):
fileset = set()
for root, _, files in os.walk(path):
for name in files:
fileset.add(os.path.join(root, name))
return fileset
def _flatten_file_list(files):
"""Flattens the given list of files by walking any specified directories."""
expanded_files = set()
file_list_files = [os.path.abspath(x[1:]) for x in files if x.startswith('@')]
files = [os.path.abspath(x) for x in files if not x.startswith('@')]
for path in file_list_files:
files += _read_file_list(path)
for path in [os.path.abspath(x) for x in files]:
if os.path.isdir(path):
expanded_files |= _expand_directory(path)
continue
if not os.path.exists(path):
logging.warning('File not found: %s', path)
continue
expanded_files.add(path)
return expanded_files
def _collect_notices(files, relative_directory):
  """Collects all the notices from files, returning a combined string.

  Args:
    files: List of file or directory paths; entries starting with '@' name
      files that themselves contain path lists.
    relative_directory: If set, file paths in the output are written
      relative to this directory.

  Returns:
    A string with one section per distinct notice: the files sharing the
    notice, a dash-padded separator after the last file, then the notice.
  """
  # Map each distinct notice text to the sorted list of files carrying it.
  notice_file_map = {}
  for filename in sorted(_flatten_file_list(files)):
    notice = _find_notice(filename)
    if not notice:
      continue
    if notice in notice_file_map:
      bisect.insort(notice_file_map[notice], filename)
    else:
      notice_file_map[notice] = [filename]
  # Sort notices by their first (alphabetically smallest) filename.
  # Iterating the dict directly works on both Python 2 and 3; the previous
  # dict.iteritems() call is Python 2-only and crashes under Python 3.
  notices = sorted(notice_file_map, key=lambda n: notice_file_map[n][0])
  # Stitch together File -> Notice text.
  result = ''
  for notice in notices:
    notice_files = notice_file_map[notice]
    if relative_directory:
      notice_files = [os.path.relpath(x, relative_directory)
                      for x in notice_files]
    result += '\n'.join(notice_files[:-1])
    if len(notice_files) > 1:
      result += '\n'
    last_file_path = notice_files[-1]
    # Pad the last filename with dashes out to the output width.
    padding = _OUTPUT_COLUMNS - 1 - len(last_file_path)
    result += '%s %s\n\n' % (last_file_path, '-' * padding)
    result += notice
    result += '\n\n\n'
  return result
def _find_block(lines, start, starter, ender=None):
"""Finds the comment block starting with the given index."""
missing = ender is None
end = len(lines)
for index in xrange(start, len(lines), 1):
if missing:
is_end = not lines[index].strip().startswith(starter)
else:
is_end = ender in lines[index]
if is_end:
end = index + 1
break
return lines[start:end]
def _find_notice(filename):
  """Returns the notice text for the given filename.

  Scans the top of the file for a Copyright line, extracts the surrounding
  comment block, scrubs junk markers, dedents the text, and trims it.
  Returns '' when no usable notice is found.
  """
  with open(filename, 'r') as f:
    lines = [x.strip('\r\n') for x in f.readlines()]
  # Locate the first Copyright line within the first
  # _MAXIMUM_COPYRIGHT_ROWS rows; deeper matches likely belong to embedded
  # code rather than the file itself.
  copyright_index = None
  for index, line in enumerate(lines):
    match = _COPYRIGHT_RE.search(line)
    if match:
      copyright_index = index
      break
    if index > _MAXIMUM_COPYRIGHT_ROWS:
      break
  if copyright_index is None:
    logging.debug('No Copyright found: %s', filename)
    return ''
  # Guess the comment style from the Copyright line itself.
  copyright_line = lines[copyright_index].strip()
  starter = None
  ender = None
  if copyright_line.startswith('//'):
    starter = '//'
  elif copyright_line.startswith('#'):
    starter = '#'
  elif copyright_line.startswith(';'):
    starter = ';'
  elif copyright_line.startswith('.\\"'):
    # troff/man-page style comment prefix.
    starter = '.\\"'
  else:
    # Default to C-style block comments, terminated by '*/'.
    starter = '/*'
    ender = '*/'
  notice_lines = _find_block(lines, copyright_index, starter, ender)
  if not notice_lines:
    logging.debug('Just a Copyright, no license: %s', filename)
    return ''
  notice_lines = [_remove_junk(x) for x in notice_lines]
  # Rigidly dedent the block to remove any non-meaningful characters:
  # find the leftmost column containing a letter or parenthesis, blank out
  # lines with no such character, and cut every remaining line at that
  # column.
  infinity = float('inf')
  minimum_letter_column = infinity
  for index, line in enumerate(notice_lines):
    match = _MEANINGFUL_LETTER_RE.search(line)
    if not match:
      notice_lines[index] = ''
      continue
    start_index = match.start(0)
    if start_index < minimum_letter_column:
      minimum_letter_column = start_index
  if minimum_letter_column < infinity:
    # Only dedent when at least one meaningful character was found.
    for index, line in enumerate(notice_lines):
      notice_lines[index] = notice_lines[index][minimum_letter_column:]
  # Remove leading and trailing whitespace.
  notice_lines = '\n'.join(notice_lines).strip().split('\n')
  # Cut off after two blank lines.
  blank = False
  for index, line in enumerate(notice_lines):
    stripped = line.strip()
    if not blank:
      if not stripped:
        blank = True
      continue
    else:
      if not stripped:
        # Second consecutive blank: drop from the first blank line onward
        # (hence index - 1).
        notice_lines = notice_lines[0:index - 1]
        break
      blank = False
  # Just a copyright means no license.
  if len(notice_lines) <= 2:
    logging.debug('Eliding license, too short: %s', filename)
    return ''
  notice_lines = [x.rstrip(' \t\r\n') for x in notice_lines]
  return '\n'.join(notice_lines)
def _create_argument_parser():
  """Create an argument parser for this script."""
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=textwrap.dedent(__doc__))
  parser.add_argument('-v', '--verbose', dest='verbose_count',
                      default=0, action='count',
                      help='Verbose level (multiple times for more).')
  parser.add_argument('-q', '--quiet', dest='quiet_count',
                      default=0, action='count',
                      help='Quietness level (multiple times for more).')
  parser.add_argument('-d', '--directory', default=None,
                      help='Relative directory for filenames.')
  parser.add_argument('file', nargs='+', type=str,
                      help='Source file or directory to extract license for.')
  return parser
def _setup_logging(default_level=logging.INFO):
"""Initializes logging format."""
logging_level = default_level
logging_format = '%(asctime)s.%(msecs)03d [%(levelname)-8s] %(message)s'
datetime_format = '%H:%M:%S'
logging.basicConfig(level=logging_level, format=logging_format,
datefmt=datetime_format)
# Log levels ordered from least to most verbose; _set_log_level indexes into
# this list to raise or lower the effective level.
_LOG_LEVELS = [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO,
               logging.DEBUG,]
def _set_log_level(verbosity, quietness):
  """Sets the log level based on the verbosity and quietness levels."""
  logger = logging.getLogger()
  current_level = logger.getEffectiveLevel()
  # Start from the current level if it is one of ours, else from INFO.
  if current_level in _LOG_LEVELS:
    level_index = _LOG_LEVELS.index(current_level)
  else:
    level_index = _LOG_LEVELS.index(logging.INFO)
  # Shift by the net verbosity and clamp to the list bounds.
  level_index += verbosity - quietness
  level_index = max(0, min(level_index, len(_LOG_LEVELS) - 1))
  logger.setLevel(_LOG_LEVELS[level_index])
def main(argv):
  """Entry point: parses arguments, prints collected notices to stdout."""
  _setup_logging()
  options = _create_argument_parser().parse_args(argv)
  _set_log_level(options.verbose_count, options.quiet_count)
  # Only honor --directory when it names an existing directory.
  relative_directory = None
  if options.directory:
    candidate = os.path.abspath(options.directory)
    if os.path.isdir(candidate):
      relative_directory = candidate
  print(_collect_notices(options.file, relative_directory))
  return 0
# Run as a script: exit with main()'s status code.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))