blob: 8d5e5b4a2ffd31d6d425bcf36d2d93060042c720 [file] [log] [blame] [edit]
#!/usr/bin/python
# Copyright (C) 2018 Google Inc. All Rights Reserved.
"""Extracts licenses from a list of source files.
Outputs a NOTICES files to stdout and a status log to stderr.
"""
from __future__ import print_function
import argparse
import bisect
import logging
import os
import re
import sys
import textwrap
# Marks the line where a copyright statement begins.
_COPYRIGHT_PATTERN = r'Copyright '
_COPYRIGHT_RE = re.compile(_COPYRIGHT_PATTERN)
# A character that carries meaning in notice text; used to find the leftmost
# "real" column when rigidly dedenting a comment block.
_MEANINGFUL_LETTER_PATTERN = r'[a-zA-Z()]'
_MEANINGFUL_LETTER_RE = re.compile(_MEANINGFUL_LETTER_PATTERN)
# RCS/CVS keyword tags such as "$Id: ... $".
_RCS_TAG_PATTERN = r'\$.*\$'
_RCS_TAG_RE = re.compile(_RCS_TAG_PATTERN)
# SCCS "what" strings ("@(#)...") through end of line.
_JUNK_PATTERN = r'@\(#\).*$'
_JUNK_RE = re.compile(_JUNK_PATTERN)
# Explicit end-of-copyright marker found in some files.
_JUNK2_PATTERN = r'###COPYRIGHTEND####'
_JUNK2_RE = re.compile(_JUNK2_PATTERN)
# Revision tags from the citrus project.
_JUNK3_PATTERN = r'citrus Id:.*Exp'
_JUNK3_RE = re.compile(_JUNK3_PATTERN)
# Dangling "from:" attribution lines.
_JUNK4_PATTERN = r'\s*from:\s*$'
_JUNK4_RE = re.compile(_JUNK4_PATTERN)
# BSD sccs include marker.
_JUNK5_PATTERN = r'%sccs.include.redist.c%'
_JUNK5_RE = re.compile(_JUNK5_PATTERN)
# The maximum number of columns to fill in the generated output.
_OUTPUT_COLUMNS = 79
# The maximum number of rows from the top of a source file to search for a
# Copyright statement. Beyond that, it is more likely that the Copyright
# statement refers to a small piece of embedded code than the whole file.
_MAXIMUM_COPYRIGHT_ROWS = 200
def _remove_junk(line):
  """Strips RCS tags and other known junk markers from a notice line."""
  # Order matters only in that it mirrors the historical scrub sequence.
  scrubbers = (_RCS_TAG_RE, _JUNK_RE, _JUNK2_RE, _JUNK3_RE, _JUNK4_RE,
               _JUNK5_RE)
  cleaned = line
  for scrubber in scrubbers:
    cleaned = scrubber.sub('', cleaned)
  return cleaned
def _read_lines(filename):
with open(filename, 'r') as f:
return [x.rstrip(' \t\r\n') for x in f.readlines()]
def _read_file_list(filename):
  """Reads a file of paths (one per line), resolved relative to that file."""
  base_directory = os.path.dirname(filename)
  return [os.path.abspath(os.path.join(base_directory, entry))
          for entry in _read_lines(filename)]
def _expand_directory(path):
fileset = set()
for root, _, files in os.walk(path):
for name in files:
fileset.add(os.path.join(root, name))
return fileset
def _flatten_file_list(files):
"""Flattens the given list of files by walking any specified directories."""
expanded_files = set()
file_list_files = [os.path.abspath(x[1:]) for x in files if x.startswith('@')]
files = [os.path.abspath(x) for x in files if not x.startswith('@')]
for path in file_list_files:
files += _read_file_list(path)
for path in [os.path.abspath(x) for x in files]:
if os.path.isdir(path):
expanded_files |= _expand_directory(path)
continue
if not os.path.exists(path):
logging.warning('File not found: %s', path)
continue
expanded_files.add(path)
return expanded_files
def _collect_notices(files, relative_directory):
  """Collects all the notices from files, returning a combined string.

  Args:
    files: List of file or directory paths; entries starting with '@' name
      files that themselves contain path lists.
    relative_directory: If set, file paths in the output are written
      relative to this directory.

  Returns:
    A string with one section per distinct notice: the files sharing the
    notice, a dash-padded separator after the last file, then the notice.
  """
  # Map each distinct notice text to the sorted list of files carrying it.
  notice_file_map = {}
  for filename in sorted(_flatten_file_list(files)):
    notice = _find_notice(filename)
    if not notice:
      continue
    if notice in notice_file_map:
      bisect.insort(notice_file_map[notice], filename)
    else:
      notice_file_map[notice] = [filename]
  # Sort notices by their first (alphabetically smallest) filename.
  # Iterating the dict directly works on both Python 2 and 3; the previous
  # dict.iteritems() call is Python 2-only and crashes under Python 3.
  notices = sorted(notice_file_map, key=lambda n: notice_file_map[n][0])
  # Stitch together File -> Notice text.
  result = ''
  for notice in notices:
    notice_files = notice_file_map[notice]
    if relative_directory:
      notice_files = [os.path.relpath(x, relative_directory)
                      for x in notice_files]
    result += '\n'.join(notice_files[:-1])
    if len(notice_files) > 1:
      result += '\n'
    last_file_path = notice_files[-1]
    # Pad the last filename with dashes out to the output width.
    padding = _OUTPUT_COLUMNS - 1 - len(last_file_path)
    result += '%s %s\n\n' % (last_file_path, '-' * padding)
    result += notice
    result += '\n\n\n'
  return result
def _find_block(lines, start, starter, ender=None):
"""Finds the comment block starting with the given index."""
missing = ender is None
end = len(lines)
for index in xrange(start, len(lines), 1):
if missing:
is_end = not lines[index].strip().startswith(starter)
else:
is_end = ender in lines[index]
if is_end:
end = index + 1
break
return lines[start:end]
def _find_notice(filename):
  """Returns the notice text for the given filename.

  Scans the top of the file for a Copyright line, extracts the surrounding
  comment block, scrubs junk markers, dedents the text, and trims it.
  Returns '' when no usable notice is found.
  """
  with open(filename, 'r') as f:
    lines = [x.strip('\r\n') for x in f.readlines()]
  # Locate the first Copyright line within the first
  # _MAXIMUM_COPYRIGHT_ROWS rows; deeper matches likely belong to embedded
  # code rather than the file itself.
  copyright_index = None
  for index, line in enumerate(lines):
    match = _COPYRIGHT_RE.search(line)
    if match:
      copyright_index = index
      break
    if index > _MAXIMUM_COPYRIGHT_ROWS:
      break
  if copyright_index is None:
    logging.debug('No Copyright found: %s', filename)
    return ''
  # Guess the comment style from the Copyright line itself.
  copyright_line = lines[copyright_index].strip()
  starter = None
  ender = None
  if copyright_line.startswith('//'):
    starter = '//'
  elif copyright_line.startswith('#'):
    starter = '#'
  elif copyright_line.startswith(';'):
    starter = ';'
  elif copyright_line.startswith('.\\"'):
    # troff/man-page style comment prefix.
    starter = '.\\"'
  else:
    # Default to C-style block comments, terminated by '*/'.
    starter = '/*'
    ender = '*/'
  notice_lines = _find_block(lines, copyright_index, starter, ender)
  if not notice_lines:
    logging.debug('Just a Copyright, no license: %s', filename)
    return ''
  notice_lines = [_remove_junk(x) for x in notice_lines]
  # Rigidly dedent the block to remove any non-meaningful characters:
  # find the leftmost column containing a letter or parenthesis, blank out
  # lines with no such character, and cut every remaining line at that
  # column.
  infinity = float('inf')
  minimum_letter_column = infinity
  for index, line in enumerate(notice_lines):
    match = _MEANINGFUL_LETTER_RE.search(line)
    if not match:
      notice_lines[index] = ''
      continue
    start_index = match.start(0)
    if start_index < minimum_letter_column:
      minimum_letter_column = start_index
  if minimum_letter_column < infinity:
    # Only dedent when at least one meaningful character was found.
    for index, line in enumerate(notice_lines):
      notice_lines[index] = notice_lines[index][minimum_letter_column:]
  # Remove leading and trailing whitespace.
  notice_lines = '\n'.join(notice_lines).strip().split('\n')
  # Cut off after two blank lines.
  blank = False
  for index, line in enumerate(notice_lines):
    stripped = line.strip()
    if not blank:
      if not stripped:
        blank = True
      continue
    else:
      if not stripped:
        # Second consecutive blank: drop from the first blank line onward
        # (hence index - 1).
        notice_lines = notice_lines[0:index - 1]
        break
      blank = False
  # Just a copyright means no license.
  if len(notice_lines) <= 2:
    logging.debug('Eliding license, too short: %s', filename)
    return ''
  notice_lines = [x.rstrip(' \t\r\n') for x in notice_lines]
  return '\n'.join(notice_lines)
def _create_argument_parser():
  """Create an argument parser for this script."""
  parser = argparse.ArgumentParser(
      formatter_class=argparse.RawDescriptionHelpFormatter,
      description=textwrap.dedent(__doc__))
  parser.add_argument('-v', '--verbose', dest='verbose_count',
                      default=0, action='count',
                      help='Verbose level (multiple times for more).')
  parser.add_argument('-q', '--quiet', dest='quiet_count',
                      default=0, action='count',
                      help='Quietness level (multiple times for more).')
  parser.add_argument('-d', '--directory', default=None,
                      help='Relative directory for filenames.')
  parser.add_argument('file', nargs='+', type=str,
                      help='Source file or directory to extract license for.')
  return parser
def _setup_logging(default_level=logging.INFO):
"""Initializes logging format."""
logging_level = default_level
logging_format = '%(asctime)s.%(msecs)03d [%(levelname)-8s] %(message)s'
datetime_format = '%H:%M:%S'
logging.basicConfig(level=logging_level, format=logging_format,
datefmt=datetime_format)
# Log levels ordered from least to most verbose; _set_log_level indexes into
# this list to raise or lower the effective level.
_LOG_LEVELS = [logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO,
               logging.DEBUG,]
def _set_log_level(verbosity, quietness):
  """Sets the log level based on the verbosity and quietness levels."""
  logger = logging.getLogger()
  current_level = logger.getEffectiveLevel()
  # Start from the current level if it is one of ours, else from INFO.
  if current_level in _LOG_LEVELS:
    level_index = _LOG_LEVELS.index(current_level)
  else:
    level_index = _LOG_LEVELS.index(logging.INFO)
  # Shift by the net verbosity and clamp to the list bounds.
  level_index += verbosity - quietness
  level_index = max(0, min(level_index, len(_LOG_LEVELS) - 1))
  logger.setLevel(_LOG_LEVELS[level_index])
def main(argv):
  """Entry point: parses arguments, prints collected notices to stdout."""
  _setup_logging()
  options = _create_argument_parser().parse_args(argv)
  _set_log_level(options.verbose_count, options.quiet_count)
  # Only honor --directory when it names an existing directory.
  relative_directory = None
  if options.directory:
    candidate = os.path.abspath(options.directory)
    if os.path.isdir(candidate):
      relative_directory = candidate
  print(_collect_notices(options.file, relative_directory))
  return 0
# Run as a script: exit with main()'s status code.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))