Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix license discovery: split at dashes for word-based matching #543

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions grayskull/license/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ def get_all_licenses_from_spdx() -> List:
]


def _replace_dashes(s: str) -> str:
"""
Replace dashes with spaces.

:param s: string to replace dashes with spaces
:return: string with dashes replaced by spaces
"""
return s.replace("-", " ")


def _match_scrambled_exact(candidate, licenses) -> str | None:
"""
Return license with rearranged word order only.
Expand All @@ -88,6 +98,9 @@ def match_license(name: str) -> dict:
name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)
name = name.strip()

if name in _get_all_license_choice(all_licenses):
return _get_license(name, all_licenses)

exact_match = _match_scrambled_exact(name, _get_all_license_choice(all_licenses))
if exact_match:
best_matches = [(exact_match, 100, 0)]
Expand Down Expand Up @@ -130,12 +143,21 @@ def match_license(name: str) -> dict:
lic[0] for lic in original_matches if lic[1] >= spdx_license[1]
]
if len(best_matches) > 1:
# we replace dashes by spaces here to match instances like
# "3-Clause BSD" with "BSD-3-Clause" which otherwise would
# not work with word-based scores like token_sort_ratio
spdx_license = process.extractOne(
name, best_matches, scorer=token_sort_ratio
name,
best_matches,
scorer=token_sort_ratio,
processor=_replace_dashes,
)
if original_matches and original_matches[0][1] < 0.55:
spdx_license = process.extractOne(
name, [m[0] for m in original_matches], scorer=token_sort_ratio
name,
[m[0] for m in original_matches],
scorer=token_sort_ratio,
processor=_replace_dashes,
)

if spdx_license[1] != 100 and spdx_license[0].startswith("MIT"):
Expand Down
7 changes: 7 additions & 0 deletions tests/license/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ def test_short_license_id(licence_name, short_licence):
assert get_short_license_id(licence_name) == short_licence


@pytest.mark.parametrize(
    "license_id", [lic["licenseId"] for lic in get_all_licenses_from_spdx()]
)
def test_short_license_id_map_to_self(license_id: str):
    """Check that every SPDX ``licenseId`` resolves to itself as short id."""
    assert get_short_license_id(license_id) == license_id


def test_get_other_names_from_opensource():
assert sorted(get_other_names_from_opensource("MIT")) == sorted(["MIT", "Expat"])

Expand Down