# Copyright (C) 2011 Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
# * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import hashlib
import logging
import re
from blinkpy.web_tests.port.driver import DeviceFailure, DriverInput, DriverOutput
from blinkpy.web_tests.models import test_failures
from blinkpy.web_tests.models.test_results import TestResult, build_test_result
from blinkpy.web_tests.models import testharness_results
_log = logging.getLogger(__name__)
def run_single_test(port, options, results_directory, worker_name, driver,
test_input):
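    """Runs one test via a SingleTestRunner and returns its TestResult.

    A minimal sketch of a call site (illustrative; the worker name and the
    variables are placeholders):

        result = run_single_test(port, options, results_directory,
                                 'worker/0', driver, test_input)
    """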
runner = SingleTestRunner(port, options, results_directory, worker_name,
driver, test_input)
try:
test_result = runner.run()
if not options.no_expectations:
test_result.create_artifacts()
return test_result
except DeviceFailure as error:
_log.error('device failed: %s', error)
return TestResult(
test_input.test_name, device_failed=True,
failures=[test_failures.FailureEarlyExit()])
class SingleTestRunner(object):
def __init__(self, port, options, results_directory, worker_name, driver,
test_input):
self._port = port
self._filesystem = port.host.filesystem
self._options = options
self._results_directory = results_directory
self._driver = driver
self._timeout_ms = test_input.timeout_ms
self._worker_name = worker_name
self._test_name = test_input.test_name
self._reference_files = test_input.reference_files
self._retry_attempt = test_input.retry_attempt
test_failures.AbstractTestResultType.port = port
test_failures.AbstractTestResultType.test_name = test_input.test_name
test_failures.AbstractTestResultType.result_directory = results_directory
test_failures.AbstractTestResultType.filesystem = self._filesystem
TestResult.repeat_tests = (self._options.watch
or self._options.repeat_each > 1
or self._options.iterations > 1)
TestResult.results_directory = self._results_directory
TestResult.filesystem = port.host.filesystem
def _expected_driver_output(self):
return DriverOutput(
self._port.expected_text(self._test_name),
self._port.expected_image(self._test_name),
self._port.expected_checksum(self._test_name),
self._port.expected_audio(self._test_name))
def _should_fetch_expected_checksum(self):
return not self._options.reset_results
def _driver_input(self):
        # The image hash is used to avoid doing an image dump if the
        # checksums match, so it should be None if we are generating a
        # new baseline. (Otherwise, an image from a previous run will be
        # copied into the baseline.)
image_hash = None
if self._should_fetch_expected_checksum():
image_hash = self._port.expected_checksum(self._test_name)
args = self._port.args_for_test(self._test_name)
test_name = self._port.name_for_test(self._test_name)
return DriverInput(test_name, self._timeout_ms, image_hash, args)
def run(self):
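        """Runs the test and returns a TestResult.

        Dispatches to the crash-test, rebaseline, reftest, or plain
        comparison path based on the options and the test type.
        """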
# WPT crash tests do not have baselines, so even when re-baselining we
# run them as normal.
if (self._options.enable_sanitizer
or self._port.is_wpt_crash_test(self._test_name)):
return self._run_crash_test()
if self._options.reset_results or self._options.copy_baselines:
return self._run_rebaseline()
if self._reference_files:
return self._run_reftest()
return self._run_compare_test()
def _run_crash_test(self):
        # Running a crash test means that we ignore the actual test output and just
        # look for timeouts and crashes (real or forced by the driver). Most crashes
        # should indicate problems found by a sanitizer (ASAN, LSAN, etc.), but we
        # report other crashes and timeouts as well.
driver_output = self._driver.run_test(self._driver_input())
expected_driver_output = self._expected_driver_output()
failures = self._handle_error(driver_output)
test_result = build_test_result(
driver_output,
self._test_name,
retry_attempt=self._retry_attempt,
failures=failures,
test_run_time=driver_output.test_time,
pid=driver_output.pid,
crash_site=driver_output.crash_site)
return test_result
def _run_compare_test(self):
"""Runs the signle test and returns test result."""
driver_output = self._driver.run_test(self._driver_input())
expected_driver_output = self._expected_driver_output()
failures = self._compare_output(expected_driver_output, driver_output)
return build_test_result(
driver_output,
self._test_name,
retry_attempt=self._retry_attempt,
failures=failures,
test_run_time=driver_output.test_time,
pid=driver_output.pid,
crash_site=driver_output.crash_site)
def _run_rebaseline(self):
"""Similar to _run_compare_test(), but has the side effect of updating
or adding baselines. This is called when --reset-results and/or
--copy-baselines are specified in the command line. If --reset-results,
in the returned result we treat baseline mismatch as success."""
driver_output = self._driver.run_test(self._driver_input())
expected_driver_output = self._expected_driver_output()
all_failures = self._compare_output(expected_driver_output,
driver_output)
if self._options.reset_results:
            # Only report leaks, timeouts and crashes, and treat all other
            # failures as successes.
reported_failures = self._handle_error(driver_output)
else:
            # Report comparison failures between the baseline and the actual
            # output, as well as leaks, timeouts and crashes.
reported_failures = all_failures
self._update_or_add_new_baselines(driver_output, all_failures)
return build_test_result(
driver_output,
self._test_name,
retry_attempt=self._retry_attempt,
failures=reported_failures,
test_run_time=driver_output.test_time,
pid=driver_output.pid,
crash_site=driver_output.crash_site)
def _update_or_add_new_baselines(self, driver_output, failures):
"""Updates or adds new baselines for the test if necessary."""
if (test_failures.has_failure_type(test_failures.FailureTimeout,
failures)
or test_failures.has_failure_type(test_failures.FailureCrash,
failures)):
return
        # We usually don't want to create a new baseline if one doesn't
        # already exist (which usually means the baseline isn't necessary,
        # e.g. for an image-only test without text expectation files).
        # However, in the following cases we do:
        # 1. The failure is MISSING; a baseline is apparently needed.
        # 2. A testharness.js test fails assertions: testharness.js tests
        #    without baselines are implicitly expected to pass all assertions;
        #    if there are failed assertions we need to create a new baseline.
        # Note that the created baseline might be redundant, but users can
        # optimize it later with optimize-baselines.
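        # For example (illustrative), a testharness test with no -expected.txt
        # whose output contains FAIL lines gets a new baseline recording those
        # failures, because of case 2 above.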
if self._is_all_pass_testharness_text_not_needing_baseline(
driver_output.text):
driver_output.text = None
self._save_baseline_data(
driver_output.text, '.txt',
test_failures.has_failure_type(test_failures.FailureMissingResult,
failures)
or test_failures.has_failure_type(
test_failures.FailureTestHarnessAssertion, failures))
self._save_baseline_data(
driver_output.audio, '.wav',
test_failures.has_failure_type(test_failures.FailureMissingAudio,
failures))
expected_png = driver_output.image
if self._reference_files:
            _log.warning('Cannot rebaseline the image baseline of reftest %s',
                         self._test_name)
# Let _save_baseline_data remove the '-expected.png' if it exists.
expected_png = None
self._save_baseline_data(
expected_png, '.png',
test_failures.has_failure_type(test_failures.FailureMissingImage,
failures))
def _save_baseline_data(self, data, extension, force_create_new_baseline):
port = self._port
fs = self._filesystem
        # Do not create a new baseline unless we are specifically told to do so.
current_expected_path = port.expected_filename(
self._test_name, extension, return_default=False)
if not current_expected_path and not force_create_new_baseline:
return
flag_specific_dir = port.baseline_flag_specific_dir()
if flag_specific_dir:
output_dir = fs.join(flag_specific_dir,
fs.dirname(self._test_name))
elif self._options.copy_baselines:
output_dir = fs.join(port.baseline_version_dir(),
fs.dirname(self._test_name))
else:
output_dir = fs.dirname(
port.expected_filename(
self._test_name,
extension,
fallback_base_for_virtual=False))
fs.maybe_make_directory(output_dir)
output_basename = fs.basename(
fs.splitext(self._test_name)[0] + '-expected' + extension)
output_path = fs.join(output_dir, output_basename)
        # Remove |output_path| if it exists and is not the generic expectation,
        # to avoid an extra baseline when the new baseline is the same as the
        # fallback baseline.
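        # For example (illustrative), if --copy-baselines targets a
        # platform-specific directory but the new result equals the generic
        # baseline, the stale platform-specific copy is removed rather than
        # kept as a redundant duplicate.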
generic_dir = fs.join(
port.web_tests_dir(),
fs.dirname(
port.lookup_virtual_test_base(self._test_name)
or self._test_name))
if (not data or output_dir != generic_dir) and fs.exists(output_path):
_log.info('Removing the current baseline "%s"',
port.relative_test_filename(output_path))
fs.remove(output_path)
# Note that current_expected_path may change because of the above file removal.
current_expected_path = port.expected_filename(
self._test_name, extension, return_default=False)
data = data or ''
if (current_expected_path and fs.sha1(current_expected_path) ==
hashlib.sha1(data).hexdigest()):
if self._options.reset_results:
_log.info(
'Not writing new baseline "%s" because it is the same as the current baseline',
port.relative_test_filename(output_path))
else:
_log.info(
'Not copying baseline to "%s" because the actual result is the same as the current baseline',
port.relative_test_filename(output_path))
return
if not data and not current_expected_path:
_log.info(
'Not creating new baseline because the test does not need it')
return
# If the data is empty and the fallback exists, we'll continue to create
# an empty baseline file to override the fallback baseline.
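        # For example (illustrative), an empty virtual-suite -expected.txt can
        # override a non-empty fallback baseline so that an empty actual
        # result still matches.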
if self._options.reset_results:
_log.info('Writing new baseline "%s"',
port.relative_test_filename(output_path))
port.update_baseline(output_path, data)
else:
_log.info('Copying baseline to "%s"',
port.relative_test_filename(output_path))
if fs.exists(current_expected_path):
fs.copyfile(current_expected_path, output_path)
else:
_log.error(
'Could not copy baseline to "%s" from "%s" because the source file does not exist',
port.relative_test_filename(output_path),
current_expected_path)
def _handle_error(self, driver_output, reference_filename=None):
"""Returns test failures if some unusual errors happen in driver's run.
Args:
driver_output: The output from the driver.
reference_filename: The full path to the reference file which produced
the driver_output. This arg is optional and should be used only in
reftests until we have a better way to know which html file
is used for producing the driver_output.
"""
failures = []
if driver_output.timeout:
failures.append(
test_failures.FailureTimeout(driver_output,
bool(reference_filename)))
if reference_filename:
testname = self._port.relative_test_filename(reference_filename)
else:
testname = self._test_name
if driver_output.crash:
failures.append(
test_failures.FailureCrash(
driver_output,
is_reftest=bool(reference_filename),
process_name=driver_output.crashed_process_name,
pid=driver_output.crashed_pid,
has_log=self._port.output_contains_sanitizer_messages(
driver_output.crash_log)))
if driver_output.error:
_log.debug('%s %s crashed, (stderr lines):', self._worker_name,
testname)
else:
_log.debug('%s %s crashed, (no stderr)', self._worker_name,
testname)
elif driver_output.leak:
failures.append(
test_failures.FailureLeak(driver_output,
bool(reference_filename)))
_log.debug('%s %s leaked', self._worker_name, testname)
elif driver_output.error:
_log.debug('%s %s output stderr lines:', self._worker_name,
testname)
for line in driver_output.error.splitlines():
_log.debug(' %s', line)
return failures
def _compare_output(self, expected_driver_output, driver_output):
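        """Returns the list of test failures from comparing the actual driver
        output against the expected output."""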
failures = []
failures.extend(self._handle_error(driver_output))
if driver_output.crash:
            # Don't continue if we already have a crash.
            # In the case of a timeout, we continue, since we still want
            # to see the text and image output.
return failures
failures.extend(
self._check_extra_and_missing_baselines(expected_driver_output,
driver_output))
testharness_completed, testharness_failures = self._compare_testharness_test(
expected_driver_output, driver_output)
if testharness_completed:
failures.extend(testharness_failures)
else:
failures.extend(
self._compare_text(expected_driver_output, driver_output))
failures.extend(
self._compare_image(expected_driver_output, driver_output))
failures.extend(
self._compare_audio(expected_driver_output, driver_output))
return failures
def _report_extra_baseline(self, driver_output, extension, message):
"""If the baseline file exists, logs an error and returns True."""
if driver_output.crash or driver_output.timeout:
return False
# If the baseline overrides a fallback one, we need the empty file to
# match the empty result.
if self._port.fallback_expected_filename(self._test_name, extension):
return False
expected_file = self._port.expected_filename(
self._test_name, extension, return_default=False)
if expected_file:
_log.error(
'%s %s, but has an extra baseline file. Please remove %s' %
(self._test_name, message, expected_file))
return True
return False
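    # An all-pass testharness result looks like (illustrative):
    #     This is a testharness.js-based test.
    #     PASS some assertion description
    #     Harness: the test ran to completion.
    # Such output needs no checked-in text baseline unless it must override
    # a non-all-pass fallback baseline.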
def _is_all_pass_testharness_text_not_needing_baseline(self, text_result):
return (
text_result
and testharness_results.is_all_pass_testharness_result(text_result)
and
            # An all-pass testharness test doesn't need a text baseline unless
            # it is overriding a fallback one.
not self._port.fallback_expected_filename(self._test_name, '.txt'))
def _check_extra_and_missing_baselines(self, expected_driver_output,
driver_output):
failures = []
if driver_output.text:
if self._is_all_pass_testharness_text_not_needing_baseline(
driver_output.text):
if self._report_extra_baseline(
driver_output, '.txt',
                        'is an all-pass testharness test'):
# TODO(wangxianzhu): Make this a failure.
pass
elif testharness_results.is_testharness_output(driver_output.text):
# We only need -expected.txt for a testharness test when we
# expect it to fail or produce additional console output (when
# -expected.txt is optional), so don't report missing
# -expected.txt for testharness tests.
pass
elif not expected_driver_output.text:
failures.append(
test_failures.FailureMissingResult(driver_output,
expected_driver_output))
elif self._report_extra_baseline(driver_output, '.txt',
'does not produce text result'):
failures.append(
test_failures.FailureTextNotGenerated(driver_output,
expected_driver_output))
if driver_output.image_hash:
if self._reference_files:
if self._report_extra_baseline(driver_output, '.png',
'is a reftest'):
# TODO(wangxianzhu): Make this a failure.
pass
else:
if not expected_driver_output.image:
failures.append(
test_failures.FailureMissingImage(
driver_output, expected_driver_output))
elif not expected_driver_output.image_hash:
failures.append(
test_failures.FailureMissingImageHash(
driver_output, expected_driver_output))
elif self._report_extra_baseline(driver_output, '.png',
'does not produce image result'):
failures.append(
test_failures.FailureImageHashNotGenerated(
driver_output, expected_driver_output))
if driver_output.audio:
if not expected_driver_output.audio:
failures.append(
test_failures.FailureMissingAudio(driver_output,
expected_driver_output))
elif self._report_extra_baseline(driver_output, '.wav',
'does not produce audio result'):
failures.append(
test_failures.FailureAudioNotGenerated(driver_output,
expected_driver_output))
return failures
def _compare_testharness_test(self, expected_driver_output, driver_output):
"""Returns (testharness_completed, testharness_failures)."""
if not driver_output.text:
return False, []
if expected_driver_output.text:
# Will compare text if there is expected text.
return False, []
if not testharness_results.is_testharness_output(driver_output.text):
return False, []
if not testharness_results.is_testharness_output_passing(
driver_output.text):
return True, [
test_failures.FailureTestHarnessAssertion(
driver_output, expected_driver_output)
]
return True, []
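    # Heuristics for dump formats: render tree dumps contain lines like
    # 'layer at (0,0) size 800x600' (the size here is illustrative), while
    # layer tree dumps are JSON beginning with '{\n "layers": ['.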
def _is_render_tree(self, text):
return text and 'layer at (0,0) size' in text
def _is_layer_tree(self, text):
return text and '{\n "layers": [' in text
def _compare_text(self, expected_driver_output, driver_output):
expected_text = expected_driver_output.text
actual_text = driver_output.text
if not expected_text or not actual_text:
return []
normalized_actual_text = self._get_normalized_output_text(actual_text)
# Assuming expected_text is already normalized.
if not self._port.do_text_results_differ(expected_text,
normalized_actual_text):
return []
        # Determine the text mismatch type:
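        # The checks below classify the mismatch from coarse to fine: any
        # difference that survives stripping all whitespace is a general text
        # mismatch; otherwise we distinguish space/tab-only, newline-only,
        # and combined whitespace differences.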
def remove_chars(text, chars):
for char in chars:
text = text.replace(char, '')
return text
def remove_ng_text(results):
processed = re.sub(
r'LayoutNG(BlockFlow|ListItem|TableCell|FlexibleBox)',
r'Layout\1', results)
# LayoutTableCaption doesn't override LayoutBlockFlow::GetName, so
# render tree dumps have "LayoutBlockFlow" for captions.
processed = re.sub('LayoutNGTableCaption', 'LayoutBlockFlow',
processed)
return processed
def is_ng_name_mismatch(expected, actual):
if not re.search(
"LayoutNG(BlockFlow|ListItem|TableCaption|TableCell|FlexibleBox)",
actual):
return False
if (not self._is_render_tree(actual)
and not self._is_layer_tree(actual)):
return False
# There's a mix of NG and legacy names in both expected and actual,
# so just remove NG from both.
return not self._port.do_text_results_differ(
remove_ng_text(expected), remove_ng_text(actual))
        # A LayoutNG name mismatch (e.g., LayoutBlockFlow vs. LayoutNGBlockFlow)
        # is treated as a pass.
if is_ng_name_mismatch(expected_text, normalized_actual_text):
return []
# General text mismatch
if self._port.do_text_results_differ(
remove_chars(expected_text, ' \t\n'),
remove_chars(normalized_actual_text, ' \t\n')):
return [
test_failures.FailureTextMismatch(driver_output,
expected_driver_output)
]
# Space-only mismatch
if not self._port.do_text_results_differ(
remove_chars(expected_text, ' \t'),
remove_chars(normalized_actual_text, ' \t')):
return [
test_failures.FailureSpacesAndTabsTextMismatch(
driver_output, expected_driver_output)
]
# Newline-only mismatch
if not self._port.do_text_results_differ(
remove_chars(expected_text, '\n'),
remove_chars(normalized_actual_text, '\n')):
return [
test_failures.FailureLineBreaksTextMismatch(
driver_output, expected_driver_output)
]
# Spaces and newlines
return [
test_failures.FailureSpaceTabLineBreakTextMismatch(
driver_output, expected_driver_output)
]
def _compare_audio(self, expected_driver_output, driver_output):
if not expected_driver_output.audio or not driver_output.audio:
return []
if self._port.do_audio_results_differ(expected_driver_output.audio,
driver_output.audio):
return [
test_failures.FailureAudioMismatch(driver_output,
expected_driver_output)
]
return []
def _get_normalized_output_text(self, output):
"""Returns the normalized text output, i.e. the output in which
the end-of-line characters are normalized to "\n".
"""
# Running tests on Windows produces "\r\n". The "\n" part is helpfully
# changed to "\r\n" by our system (Python/Cygwin), resulting in
# "\r\r\n", when, in fact, we wanted to compare the text output with
# the normalized text expectation files.
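        # Illustrative mapping: 'PASS\r\r\n' -> 'PASS\r\n' -> 'PASS\n', and a
        # plain 'PASS\r\n' -> 'PASS\n'.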
return output.replace('\r\r\n', '\r\n').replace('\r\n', '\n')
def _compare_image(self, expected_driver_output, driver_output):
if not expected_driver_output.image or not expected_driver_output.image_hash:
return []
# The presence of an expected image, but a lack of an outputted image
# does not signify an error. content::WebTestControlHost checks the
# image_hash, and upon a match simply skips recording the outputted
# image. This even occurs when results_directory is set.
if not driver_output.image or not driver_output.image_hash:
return []
if driver_output.image_hash != expected_driver_output.image_hash:
diff, err_str = self._port.diff_image(expected_driver_output.image,
driver_output.image)
if diff:
driver_output.image_diff = diff
if err_str:
_log.error(' %s : %s', self._test_name, err_str)
driver_output.error = (driver_output.error or '') + err_str
if diff or err_str:
return [
test_failures.FailureImageHashMismatch(
driver_output, expected_driver_output)
]
else:
# See https://bugs.webkit.org/show_bug.cgi?id=69444 for why this
# isn't a full failure.
_log.warning(' %s -> pixel hash failed (but diff passed)',
self._test_name)
return []
def _run_reftest(self):
test_output = self._driver.run_test(self._driver_input())
total_test_time = test_output.test_time
expected_text = self._port.expected_text(self._test_name)
expected_text_output = DriverOutput(
text=expected_text, image=None, image_hash=None, audio=None)
        # This _compare_output() call compares text when expected text exists,
        # ignores images, checks for extra baselines, and generates crash or
        # timeout failures if needed.
compare_text_failures = self._compare_output(expected_text_output,
test_output)
# If the test crashed, or timed out, or a leak was detected, there's no point
# in running the reference at all. This can save a lot of execution time if we
# have a lot of crashes or timeouts.
if test_output.crash or test_output.timeout or test_output.leak:
return build_test_result(
test_output,
self._test_name,
retry_attempt=self._retry_attempt,
failures=compare_text_failures,
test_run_time=test_output.test_time,
pid=test_output.pid,
crash_site=test_output.crash_site)
        # A reftest can have multiple match references and multiple mismatch
        # references; the test fails if any mismatch reference matches the
        # test output, or if none of the match references do. To minimize the
        # number of references we have to check, we run all of the mismatches
        # first, then the matches, and short-circuit out as soon as we can.
        # Note that sorting by the expectation sorts "!=" before "==", so this
        # is easy to do.
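        # For example (illustrative), with references
        # [('!=', 'ref-a.html'), ('==', 'ref-b.html')], ref-a.html runs
        # first; a '!=' reference whose image hash matches (a failure) or a
        # '==' reference whose image hash matches (a pass) ends the loop
        # early.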
expected_output = None
reference_test_names = []
reftest_failures = []
args = self._port.args_for_test(self._test_name)
        # Sort self._reference_files to put mismatch references first.
for expectation, reference_filename in sorted(self._reference_files):
reference_test_name = self._port.relative_test_filename(
reference_filename)
reference_test_names.append(reference_test_name)
driver_input = DriverInput(
reference_test_name,
self._timeout_ms,
image_hash=test_output.image_hash,
args=args)
expected_output = self._driver.run_test(driver_input)
total_test_time += expected_output.test_time
reftest_failures = self._compare_output_with_reference(
expected_output, test_output, reference_filename,
expectation == '!=')
if ((expectation == '!=' and reftest_failures)
or (expectation == '==' and not reftest_failures)):
break
assert expected_output
        # Combine the text-comparison results with the reftest results.
expected_output.text = expected_text_output.text
failures = reftest_failures + compare_text_failures
# FIXME: We don't really deal with a mix of reftest types properly. We
# pass in a set() to reftest_type and only really handle the first of
# the references in the result.
reftest_type = list(
set([
reference_file[0] for reference_file in self._reference_files
]))
return build_test_result(
test_output,
self._test_name,
retry_attempt=self._retry_attempt,
failures=failures,
test_run_time=total_test_time,
reftest_type=reftest_type,
pid=test_output.pid,
crash_site=test_output.crash_site,
references=reference_test_names)
def _compare_output_with_reference(self, reference_driver_output,
actual_driver_output,
reference_filename, mismatch):
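        """Compares the actual output with one reference's output.

        Args:
            mismatch: True for a '!=' (mismatch) reference, in which case
                identical image hashes constitute the failure.
        """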
failures = []
        # Don't continue if we already have a crash.
failures.extend(self._handle_error(actual_driver_output))
if failures:
return failures
failures.extend(
self._handle_error(
reference_driver_output,
reference_filename=reference_filename))
if failures:
return failures
if not actual_driver_output.image_hash:
failures.append(
test_failures.FailureReftestNoImageGenerated(
actual_driver_output, reference_driver_output,
reference_filename))
elif not reference_driver_output.image_hash:
failures.append(
test_failures.FailureReftestNoReferenceImageGenerated(
actual_driver_output, reference_driver_output,
reference_filename))
elif mismatch:
if reference_driver_output.image_hash == actual_driver_output.image_hash:
failures.append(
test_failures.FailureReftestMismatchDidNotOccur(
actual_driver_output, reference_driver_output,
reference_filename))
elif reference_driver_output.image_hash != actual_driver_output.image_hash:
max_channel_diff, max_pixels_diff = self._port.get_wpt_fuzzy_metadata(
self._test_name)
diff, err_str = self._port.diff_image(
reference_driver_output.image,
actual_driver_output.image,
max_channel_diff=max_channel_diff,
max_pixels_diff=max_pixels_diff)
if diff:
actual_driver_output.image_diff = diff
if err_str:
_log.error(err_str)
actual_driver_output.error = (actual_driver_output.error
or '') + err_str
if diff or err_str:
failures.append(
test_failures.FailureReftestMismatch(
actual_driver_output, reference_driver_output,
reference_filename))
elif err_str:
                # TODO(rmhasan): Should we include this error message in the artifacts?
_log.error(' %s : %s', self._test_name, err_str)
else:
_log.warning(
" %s -> ref test hashes didn't match but diff passed",
self._test_name)
return failures