Skip to content

Commit

Permalink
Improve copyright detection
Browse files Browse the repository at this point in the history
Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Sep 6, 2024
1 parent be5cdba commit 7f72ab9
Show file tree
Hide file tree
Showing 27 changed files with 217 additions and 0 deletions.
26 changes: 26 additions & 0 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,8 @@ def build_detection_from_node(
'(20[0-3][0-9][\\.,\\-])+20[0-3][0-9]' # 2001-2012
'|'
'(20[0-3][0-9][\\.,\\-])+20[0-3]x' # 2001-201x
'|'
'(20[0-3][0-9][\\.,\\-])+20[0-3][0-9]a' # 2001-2012a
')')

_PUNCT = (
Expand Down Expand Up @@ -1257,6 +1259,7 @@ def build_detection_from_node(
(r'^Activation\.?$', 'NN'),
(r'^Act[\.,]?$', 'NN'),
(r'^Added$', 'NN'),
(r'^added$', 'JUNK'),
(r'^As$', 'NN'),
(r'^I$', 'NN'),
(r'^Additional$', 'NN'),
Expand Down Expand Up @@ -1350,6 +1353,11 @@ def build_detection_from_node(
(r'^Entity$', 'NN'),
(r'^Example', 'NN'),
(r'^Except', 'NN'),
(r'^Fragments$', 'NN'),
(r'^With$', 'NN'),
(r'^Tick$', 'NN'),
(r'^Dynamic$', 'NN'),

(r'^When$', 'NN'),
# (r'^Owner$', 'NN'),
(r'^Specifications?$', 'NN'),
Expand Down Expand Up @@ -1567,6 +1575,14 @@ def build_detection_from_node(
(r'^They$', 'JUNK'),
(r'^Branched$', 'NN'),

(r'^Improved$', 'NN'),
(r'^Designed$', 'NN'),
(r'^Organised$', 'NN'),
(r'^Re-organised$', 'NN'),
(r'^Swap$', 'NN'),
(r'^Adapted$', 'JUNK'),
(r'^Thumb$', 'NN'),

# alone this is not enough for an NNP
(r'^Free$', 'NN'),

Expand Down Expand Up @@ -1620,6 +1636,7 @@ def build_detection_from_node(
(r'^Unlike$', 'NN'),
(r'^Compression$', 'NN'),
(r'^Letter$', 'NN'),
(r'^Moved$', 'NN'),

# dual caps that are not NNP
(r'^Make[A-Z]', 'JUNK'),
Expand Down Expand Up @@ -1799,6 +1816,9 @@ def build_detection_from_node(
(r'^(S\.?A\.?S?|Sas|sas|A\/S|AG,?|AB|Labs?|[Cc][Oo]|Research|Center|INRIA|Societe|KG)[,\.]?$', 'COMP'),
# French SARL
(r'^(SARL|S\.A\.R\.L\.)[\.,\)]*$', 'COMP'),
# More company suffixes: a.s. in Czechia and others
(r'^(a\.s\.|S\.r\.l\.?)$', 'COMP'),
(r'^Vertriebsges\.m\.b\.H\.?,?$', 'COMP'),

# company suffix : AS: this is frequent beyond Norway.
(r'^AS', 'CAPS'),
Expand Down Expand Up @@ -2070,6 +2090,9 @@ def build_detection_from_node(
# proper noun with apostrophe ': d'Itri
(r"^[a-z]'[A-Z]?[a-z]+[,\.]?$", 'NNP'),

# exceptions to all CAPS words
(r'^[A-Z]{3,4}[0-9]{4},?$', 'NN'),

# all CAPS word, at least 1 char long such as MIT, including an optional trailing comma
(r'^[A-Z0-9]+,?$', 'CAPS'),

Expand Down Expand Up @@ -2680,6 +2703,9 @@ def build_detection_from_node(
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
# Copyright (c) 2014 Czech Technical University in Prague
COPYRIGHT: {<COPYRIGHT> <NN> <UNI> <NAME>} #157999-name
COPYRIGHT: {<COPY>+ <CAPS|NNP>+ <CC> <NN> <COPY> <YR-RANGE>?} #1590
# // (c) (C) → ©
Expand Down
2 changes: 2 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/2011a.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
* Copyright 2010-2011a Analog Devices Inc.

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/2011a.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright 2010-2011a Analog Devices Inc.
holders:
- Analog Devices Inc.
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/adapted.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Copyright (C) 1998 Frederic Rible F1OAT ([email protected])
Adapted from baycom.c driver written by Thomas Sailer ([email protected])

10 changes: 10 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/adapted.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1998 Frederic Rible F1OAT ([email protected])
holders:
- Frederic Rible F1OAT
authors:
- Thomas Sailer ([email protected])
2 changes: 2 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/adapted2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
(c) 2004 MontaVista Software, Inc.
* - Adapted from gdb/sim/arm/thumbemu.c
8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/adapted2.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- (c) 2004 MontaVista Software, Inc.
holders:
- MontaVista Software, Inc.
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/added.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.:
* - added disk storage for bitmap

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/added.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 2003-2004, Paul Clements, SteelEye Technology, Inc.
holders:
- Paul Clements, SteelEye Technology, Inc.
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/ak.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Copyright (C) 2012 AK signal Brno a.s.
* 2012 Jiri Prchal <[email protected]>

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/ak.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 2012 AK signal Brno a.s. 2012 Jiri Prchal <[email protected]>
holders:
- AK signal Brno a.s. Jiri Prchal
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/caps.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Copyright (c) 2005-2006 Mauro Carvalho Chehab <[email protected]>
SAA7111, SAA7113 and SAA7118 support

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/caps.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 2005-2006 Mauro Carvalho Chehab <[email protected]>
holders:
- Mauro Carvalho Chehab
2 changes: 2 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/ctu.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Copyright: (c) 2014 Czech Technical University in Prague

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/ctu.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 2014 Czech Technical University in Prague
holders:
- Czech Technical University in Prague
30 changes: 30 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/dynamic.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Copyright (c) Dynamic Network Services, Inc
Copyright (c) frisB.com &lt;[email protected]&gt;






Copyright (c) dynamic-evaluation, https://github.com/benkrause/dynamic-evaluation








copyright (c) Dynamic Drive (www.dynamicdrive.com)





Copyright (C) Dynamic System Bars Project





# Copyright (c) Dynamic Solutions. All rights reserved.
18 changes: 18 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/dynamic.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) Dynamic Network Services, Inc
- Copyright (c) frisB.com <[email protected]>
- Copyright (c) dynamic-evaluation, https://github.com/benkrause/dynamic-evaluation
- copyright (c) Dynamic Drive (www.dynamicdrive.com)
- Copyright (c) Dynamic System Bars Project
- Copyright (c) Dynamic Solutions
holders:
- Dynamic Network Services, Inc
- frisB.com
- dynamic-evaluation
- Dynamic Drive
- Dynamic System Bars Project
- Dynamic Solutions
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/improved.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Copyright (C) 1998 Andrea Arcangeli
* 1999-03-10 Improved NTP compatibility by Ulrich Windl

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/improved.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1998 Andrea Arcangeli 1999-03-10
holders:
- Andrea Arcangeli
7 changes: 7 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/irq.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Copyright (C) 1992 Linus Torvalds
Modifications for ARM processor Copyright (C) 1995-2000 Russell King.

Support for Dynamic Tick Timer Copyright (C) 2004-2005 Nokia Corporation.
Dynamic Tick Timer written by Tony Lindgren <[email protected]> and
Tuukka Tikkanen <[email protected]>.

14 changes: 14 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/irq.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1992 Linus Torvalds
- Copyright (c) 1995-2000 Russell King
- Copyright (c) 2004-2005 Nokia Corporation
holders:
- Linus Torvalds
- Russell King
- Nokia Corporation
authors:
- Tony Lindgren <[email protected]> and Tuukka Tikkanen <[email protected]>
4 changes: 4 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/moved.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Extra data:
* Copyright (C) 1994-1999 Russell King
* Moved from linux/arch/arm/kernel/debug.S by Ben Dooks

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/moved.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1994-1999 Russell King
holders:
- Russell King
2 changes: 2 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/reorg.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/reorg.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1991-1998 Linus Torvalds
holders:
- Linus Torvalds
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/with-add.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Copyright (C) 1997 Sigurdur Asgeirsson
With additional hacking by Jeffrey Kuskin ([email protected])

10 changes: 10 additions & 0 deletions tests/cluecode/data/copyrights/misco4/linux/with-add.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
what:
- copyrights
- holders
- authors
copyrights:
- Copyright (c) 1997 Sigurdur Asgeirsson
holders:
- Sigurdur Asgeirsson
authors:
- Jeffrey Kuskin ([email protected])

0 comments on commit 7f72ab9

Please sign in to comment.