| #!/usr/bin/python |
| # Copyright (C) 2018 Google Inc. All Rights Reserved. |
| """Extracts licenses from a list of source files. |
| |
| Outputs a NOTICES file to stdout and a status log to stderr. |
| """ |
| |
| from __future__ import print_function |
| |
| import argparse |
| import bisect |
| import logging |
| import os |
| import re |
| import sys |
| import textwrap |
| |
| _COPYRIGHT_PATTERN = r'Copyright ' |
| _COPYRIGHT_RE = re.compile(_COPYRIGHT_PATTERN) |
| |
| _MEANINGFUL_LETTER_PATTERN = r'[a-zA-Z()]' |
| _MEANINGFUL_LETTER_RE = re.compile(_MEANINGFUL_LETTER_PATTERN) |
| |
| _RCS_TAG_PATTERN = r'\$.*\$' |
| _RCS_TAG_RE = re.compile(_RCS_TAG_PATTERN) |
| |
| _JUNK_PATTERN = r'@\(#\).*$' |
| _JUNK_RE = re.compile(_JUNK_PATTERN) |
| |
| _JUNK2_PATTERN = r'###COPYRIGHTEND####' |
| _JUNK2_RE = re.compile(_JUNK2_PATTERN) |
| |
| _JUNK3_PATTERN = r'citrus Id:.*Exp' |
| _JUNK3_RE = re.compile(_JUNK3_PATTERN) |
| |
| _JUNK4_PATTERN = r'\s*from:\s*$' |
| _JUNK4_RE = re.compile(_JUNK4_PATTERN) |
| |
| _JUNK5_PATTERN = r'%sccs.include.redist.c%' |
| _JUNK5_RE = re.compile(_JUNK5_PATTERN) |
| |
| # The maxiumum number of columns to fill in the generated output. |
| _OUTPUT_COLUMNS = 79 |
| |
| # The maximum number of rows from the top of a source file to search for a |
| # Copyright statement. Beyond that, it is more likely that the Copyright |
| # statement refers to a small piece of embedded code than the whole file. |
| _MAXIMUM_COPYRIGHT_ROWS = 200 |
| |
| |
def _remove_junk(line):
    """Returns `line` with every known junk pattern deleted.

    The patterns are applied one after another in a fixed order (the RCS tag
    pattern first, then the numbered junk patterns), each operating on the
    output of the previous substitution.
    """
    scrubbers = (
        _RCS_TAG_RE,
        _JUNK_RE,
        _JUNK2_RE,
        _JUNK3_RE,
        _JUNK4_RE,
        _JUNK5_RE,
    )
    for scrubber in scrubbers:
        line = scrubber.sub('', line)
    return line
| |
| |
| def _read_lines(filename): |
| with open(filename, 'r') as f: |
| return [x.rstrip(' \t\r\n') for x in f.readlines()] |
| |
| |
def _read_file_list(filename):
    """Reads a file-list file and returns the absolute paths it names.

    Every non-blank line of `filename` is taken as a path relative to the
    directory that contains `filename`.

    Blank lines are skipped. Joining an empty entry onto the list file's
    directory yields that directory itself, which the caller would then walk
    recursively and pull in unrelated files.

    Args:
      filename: path of the list file.

    Returns:
      A list of absolute paths, in the order they appear in the list file.
    """
    file_directory = os.path.dirname(filename)
    return [os.path.abspath(os.path.join(file_directory, entry))
            for entry in _read_lines(filename) if entry]
| |
| |
| def _expand_directory(path): |
| fileset = set() |
| for root, _, files in os.walk(path): |
| for name in files: |
| fileset.add(os.path.join(root, name)) |
| return fileset |
| |
| |
def _flatten_file_list(files):
    """Resolves command-line file arguments into a set of absolute file paths.

    An argument starting with '@' names a list file whose entries are read
    via _read_file_list(); every other argument is a path. Directories are
    walked recursively, paths that do not exist are reported with a warning
    and skipped, and everything else is added as-is.
    """
    candidates = []
    list_files = []
    for entry in files:
        if entry.startswith('@'):
            list_files.append(os.path.abspath(entry[1:]))
        else:
            candidates.append(os.path.abspath(entry))

    # Entries from list files are processed after the directly named paths.
    for list_file in list_files:
        candidates.extend(_read_file_list(list_file))

    flattened = set()
    for candidate in candidates:
        candidate = os.path.abspath(candidate)
        if os.path.isdir(candidate):
            flattened.update(_expand_directory(candidate))
        elif os.path.exists(candidate):
            flattened.add(candidate)
        else:
            logging.warning('File not found: %s', candidate)
    return flattened
| |
| |
def _collect_notices(files, relative_directory):
    """Collects all the notices from files, returning a combined string.

    Files that share identical notice text are grouped into one section. A
    section lists its files one per line, pads the last file name with dashes
    up to _OUTPUT_COLUMNS, and is followed by the notice text. Sections are
    ordered by the first (absolute) file name of each group.

    Args:
      files: file arguments as accepted by _flatten_file_list().
      relative_directory: if truthy, file names are printed relative to this
        directory; otherwise absolute paths are printed.

    Returns:
      The combined NOTICES text; an empty string when no notice was found.
    """
    # Group files by notice text. File names arrive in sorted order, so
    # appending keeps every group sorted.
    notice_file_map = {}
    for filename in sorted(_flatten_file_list(files)):
        notice = _find_notice(filename)
        if notice:
            notice_file_map.setdefault(notice, []).append(filename)

    # Sort notices by their first file name. Iterating the dict directly
    # works on both Python 2 and Python 3 (dict.iteritems() is Python 2 only).
    notices = sorted(notice_file_map,
                     key=lambda text: notice_file_map[text][0])

    # Stitch together File -> Notice text.
    sections = []
    for notice in notices:
        paths = notice_file_map[notice]
        if relative_directory:
            paths = [os.path.relpath(x, relative_directory) for x in paths]
        last_file_path = paths[-1]
        # A negative padding simply produces no dashes.
        padding = _OUTPUT_COLUMNS - 1 - len(last_file_path)
        header_lines = paths[:-1] + [
            '%s %s' % (last_file_path, '-' * padding)]
        sections.append('\n'.join(header_lines) + '\n\n' + notice + '\n\n\n')
    return ''.join(sections)
| |
| |
| def _find_block(lines, start, starter, ender=None): |
| """Finds the comment block starting with the given index.""" |
| missing = ender is None |
| end = len(lines) |
| |
| for index in xrange(start, len(lines), 1): |
| if missing: |
| is_end = not lines[index].strip().startswith(starter) |
| else: |
| is_end = ender in lines[index] |
| if is_end: |
| end = index + 1 |
| break |
| |
| return lines[start:end] |
| |
| |
def _find_notice(filename):
    """Returns the notice text for the given filename.

    The first line matching _COPYRIGHT_RE within the first
    _MAXIMUM_COPYRIGHT_ROWS rows starts the notice. The enclosing comment
    block is extracted, junk patterns are removed, comment decoration is cut
    off by dedenting to the left-most meaningful letter, and the text is
    truncated at the first pair of consecutive blank lines.

    Args:
      filename: path of the source file to inspect.

    Returns:
      The notice text, or an empty string when the file has no Copyright
      line, the notice is too short to contain a license, or the file cannot
      be decoded as text.
    """
    try:
        with open(filename, 'r') as f:
            lines = [x.strip('\r\n') for x in f.readlines()]
    except UnicodeDecodeError:
        # On Python 3 a binary or oddly encoded file would otherwise abort
        # the whole run; skip it but make the omission visible.
        logging.warning('Skipping file that is not decodable as text: %s',
                        filename)
        return ''

    # Only the first _MAXIMUM_COPYRIGHT_ROWS rows are searched.
    copyright_index = None
    for index, line in enumerate(lines[:_MAXIMUM_COPYRIGHT_ROWS]):
        if _COPYRIGHT_RE.search(line):
            copyright_index = index
            break

    if copyright_index is None:
        logging.debug('No Copyright found: %s', filename)
        return ''

    # Pick the comment style from the Copyright line. Anything that is not a
    # known line-comment prefix is treated as a /* ... */ block.
    copyright_line = lines[copyright_index].strip()
    line_comment_prefixes = ('//', '#', ';', '.\\"')
    starter = next((prefix for prefix in line_comment_prefixes
                    if copyright_line.startswith(prefix)), None)
    ender = None
    if starter is None:
        starter = '/*'
        ender = '*/'

    notice_lines = _find_block(lines, copyright_index, starter, ender)

    if not notice_lines:
        logging.debug('Just a Copyright, no license: %s', filename)
        return ''

    notice_lines = [_remove_junk(x) for x in notice_lines]

    # Rigidly dedent the block to remove any non-meaningful characters.
    # Lines without a meaningful letter are blanked out entirely.
    letter_columns = []
    for index, line in enumerate(notice_lines):
        match = _MEANINGFUL_LETTER_RE.search(line)
        if match:
            letter_columns.append(match.start(0))
        else:
            notice_lines[index] = ''

    if letter_columns:
        margin = min(letter_columns)
        notice_lines = [line[margin:] for line in notice_lines]

    # Remove leading and trailing whitespace.
    notice_lines = '\n'.join(notice_lines).strip().split('\n')

    # Cut off after two blank lines: drop both blank lines and all that
    # follows them.
    for index in range(1, len(notice_lines)):
        previous_blank = not notice_lines[index - 1].strip()
        current_blank = not notice_lines[index].strip()
        if previous_blank and current_blank:
            notice_lines = notice_lines[:index - 1]
            break

    # Just a copyright means no license.
    if len(notice_lines) <= 2:
        logging.debug('Eliding license, too short: %s', filename)
        return ''

    return '\n'.join(x.rstrip(' \t\r\n') for x in notice_lines)
| |
| |
def _create_argument_parser():
    """Builds the command-line parser, using the module docstring as help."""
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Both verbosity switches are repeatable counters starting at zero.
    counter_flags = (
        ('-v', '--verbose', 'verbose_count',
         'Verbose level (multiple times for more).'),
        ('-q', '--quiet', 'quiet_count',
         'Quietness level (multiple times for more).'),
    )
    for short_flag, long_flag, destination, help_text in counter_flags:
        parser.add_argument(short_flag, long_flag, dest=destination,
                            action='count', default=0, help=help_text)

    parser.add_argument('-d', '--directory', default=None,
                        help='Relative directory for filenames.')
    parser.add_argument('file', type=str, nargs='+',
                        help='Source file or directory to extract license for.')
    return parser
| |
| |
| def _setup_logging(default_level=logging.INFO): |
| """Initializes logging format.""" |
| logging_level = default_level |
| logging_format = '%(asctime)s.%(msecs)03d [%(levelname)-8s] %(message)s' |
| datetime_format = '%H:%M:%S' |
| logging.basicConfig(level=logging_level, format=logging_format, |
| datefmt=datetime_format) |
| |
| |
| _LOG_LEVELS = [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, |
| logging.DEBUG,] |
| |
| |
| def _set_log_level(verbosity, quietness): |
| """Sets the log level based on the verbosity and quietness levels.""" |
| log_delta = verbosity - quietness |
| logger = logging.getLogger() |
| try: |
| level_index = _LOG_LEVELS.index(logger.getEffectiveLevel()) |
| except ValueError: |
| level_index = _LOG_LEVELS.index(logging.INFO) |
| |
| level_index = min(len(_LOG_LEVELS) - 1, max(0, level_index + log_delta)) |
| logging.getLogger().setLevel(_LOG_LEVELS[level_index]) |
| |
| |
def main(argv):
    """Runs the license extractor and prints the NOTICES text to stdout.

    Args:
      argv: command-line arguments, without the program name.

    Returns:
      The process exit code, always 0.
    """
    _setup_logging()
    options = _create_argument_parser().parse_args(argv)
    _set_log_level(options.verbose_count, options.quiet_count)

    relative_directory = None
    if options.directory:
        candidate = os.path.abspath(options.directory)
        if os.path.isdir(candidate):
            relative_directory = candidate
        else:
            # Fall back to absolute paths, but tell the user the option was
            # dropped instead of ignoring it silently.
            logging.warning('Ignoring --directory, not a directory: %s',
                            candidate)

    print(_collect_notices(options.file, relative_directory))

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))