recipetool: create: add license file crunching

Matching license texts directly to md5sums only goes so far. Some
licenses make the copyright statement an intrinsic part of the license
statement (e.g. MIT) which of course varies between projects. Also,
people often seem to take standard license texts such as GPLv2 and
reformat them cosmetically - re-wrapping lines at a different width or
changing quoting styles are seemingly popular examples. In order to
match license files to their actual licenses more effectively, "crunch"
out these elements before comparing to an md5sum. (The existing plain
md5sum matching has been left in since it's a shortcut, and our list of
crunched md5sums isn't a complete replacement for it.)

As always, this code isn't providing any guarantees (legal or otherwise)
that it will always get the license correct. As indicated by the
accompanying comments, the LICENSE values it writes out to the recipe are
indicative only, and you should verify them yourself by looking at the
documentation supplied by upstream for the software being built if you
have any concerns.

(From OE-Core rev: 553bb4ea5d51be5179e7d8c019740cf61ece76ea)

Signed-off-by: Paul Eggleton <paul.eggleton@linux.intel.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
This commit is contained in:
Paul Eggleton 2016-03-09 17:48:51 +13:00 committed by Richard Purdie
parent 3fd244b94f
commit d46827cfd3
1 changed files with 76 additions and 1 deletions

View File

@ -25,6 +25,7 @@ import json
import logging
import scriptutils
import urlparse
import hashlib
logger = logging.getLogger('recipetool')
@ -717,6 +718,76 @@ def get_license_md5sums(d, static_only=False):
md5sums['54c7042be62e169199200bc6477f04d1'] = 'BSD-3-Clause'
return md5sums
def crunch_license(licfile):
    '''
    Remove non-material text from a license file and then check
    its md5sum against a known list. This works well for licenses
    which contain a copyright statement, but is also a useful way
    to handle people's insistence upon reformatting the license text
    slightly (with no material difference to the text of the
    license).

    "Crunching" means: dropping copyright lines, license title lines and
    "This software is released under the ... License" statements,
    stripping and squashing whitespace on every remaining line, turning
    smart quotes, double quotes and backticks into single quotes,
    discarding empty lines and joining what is left with single spaces.

    licfile is the path of the license file to read.

    Returns a (license, md5val, lictext) tuple:
      license - name of the matched license, or None if the crunched
                md5sum is not in the known list
      md5val  - hex md5sum of the crunched text, or None if the text
                could not be decoded/encoded
      lictext - list of crunched lines, or '' if the text could not be
                decoded/encoded
    '''
    import oe.utils

    # Note: these are carefully constructed!
    # (Raw strings so that "\(" reaches the regex engine untouched.)
    license_title_re = re.compile(r'^\(?(#+ *)?(The )?.{1,10} [Ll]icen[sc]e( \(.{1,10}\))?\)?:?$')
    license_statement_re = re.compile(r'^This (project|software) is( free software)? released under the .{1,10} [Ll]icen[sc]e:?$')
    copyright_re = re.compile(r'^(#+)? *Copyright .*$')

    # md5sum of the crunched text -> license name
    crunched_md5sums = {
        # The following two were gleaned from the "forever" npm package
        '0a97f8e4cbaf889d6fa51f84b89a79f6': 'ISC',
        'eecf6429523cbc9693547cf2db790b5c': 'MIT',
        # https://github.com/vasi/pixz/blob/master/LICENSE
        '2f03392b40bbe663597b5bd3cc5ebdb9': 'BSD-2-Clause',
        # https://github.com/waffle-gl/waffle/blob/master/LICENSE.txt
        'e72e5dfef0b1a4ca8a3d26a60587db66': 'BSD-2-Clause',
        # https://github.com/spigwitmer/fakeds1963s/blob/master/LICENSE
        '8be76ac6d191671f347ee4916baa637e': 'GPLv2',
        # https://github.com/datto/dattobd/blob/master/COPYING
        # http://git.savannah.gnu.org/cgit/freetype/freetype2.git/tree/docs/GPLv2.TXT
        '1d65c5ad4bf6489f85f4812bf08ae73d': 'GPLv2',
        # http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
        # http://git.neil.brown.name/?p=mdadm.git;a=blob;f=COPYING;h=d159169d1050894d3ea3b98e1c965c4058208fe1;hb=HEAD
        'fb530f66a7a89ce920f0e912b5b66d4b': 'GPLv2',
        # https://github.com/gkos/nrf24/blob/master/COPYING
        '7b6aaa4daeafdfa6ed5443fd2684581b': 'GPLv2',
        # https://github.com/josch09/resetusb/blob/master/COPYING
        '8b8ac1d631a4d220342e83bcf1a1fbc3': 'GPLv3',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv2.1
        '2ea316ed973ae176e502e2297b574bb3': 'LGPLv2.1',
        # unixODBC-2.3.4 COPYING
        '1daebd9491d1e8426900b4fa5a422814': 'LGPLv2.1',
        # https://github.com/FFmpeg/FFmpeg/blob/master/COPYING.LGPLv3
        '2ebfb3bb49b9a48a075cc1425e7f4129': 'LGPLv3',
    }

    # Smart quotes, double quotes and backticks all become single quotes
    quote_chars = (u"\u2018", u"\u2019", u"\u201c", u"\u201d", u'"', u'`')

    lictext = []
    with open(licfile, 'r') as f:
        for line in f:
            # Drop opening statements
            if (copyright_re.match(line)
                    or license_title_re.match(line)
                    or license_statement_re.match(line)):
                continue
            try:
                # Squash spaces
                line = oe.utils.squashspaces(line.strip()).decode("utf-8")
            except UnicodeDecodeError:
                # Not UTF-8 (e.g. a Latin-1 file): this is a best-effort
                # guess, so report "no match" rather than blowing up
                return None, None, ''
            for quote in quote_chars:
                line = line.replace(quote, "'")
            if line:
                lictext.append(line)

    try:
        # Hash the UTF-8 bytes explicitly; handing unicode text straight
        # to md5 makes Python 2 encode it as ASCII, which fails as soon
        # as any non-ASCII character is left in the text. For pure-ASCII
        # text the resulting digest is unchanged.
        md5val = hashlib.md5(u' '.join(lictext).encode('utf-8')).hexdigest()
    except UnicodeEncodeError:
        # Defensive: keep the established "could not hash" result
        md5val = None
        lictext = ''

    matched_license = crunched_md5sums.get(md5val, None)
    return matched_license, md5val, lictext
def guess_license(srctree):
import bb
md5sums = get_license_md5sums(tinfoil.config_data)
@ -733,7 +804,11 @@ def guess_license(srctree):
licfiles.append(fullpath)
for licfile in licfiles:
md5value = bb.utils.md5_file(licfile)
license = md5sums.get(md5value, 'Unknown')
license = md5sums.get(md5value, None)
if not license:
license, crunched_md5, lictext = crunch_license(licfile)
if not license:
license = 'Unknown'
licenses.append((license, os.path.relpath(licfile, srctree), md5value))
# FIXME should we grab at least one source file with a license header and add that too?