Skip to content

Scan package files and extract for packages #1207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions scanpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1909,6 +1909,17 @@ def no_status(self, status=None):
return self.filter(~Q(status=status))
return self.filter(status="")

def package_files(self):
    """
    Return the CodebaseResources whose status marks them as part of either
    an application package or a system package.
    """
    # NOTE(review): imported at call time, presumably to avoid a circular
    # import between the models and the pipes modules -- confirm.
    from scanpipe.pipes import flag

    in_application_package = Q(status=flag.APPLICATION_PACKAGE)
    in_system_package = Q(status=flag.SYSTEM_PACKAGE)
    return self.filter(in_application_package | in_system_package)

def empty(self):
    """Return the resources whose size is either unset (NULL) or zero."""
    size_is_unset = Q(size__isnull=True)
    size_is_zero = Q(size=0)
    return self.filter(size_is_unset | size_is_zero)

Expand Down
2 changes: 2 additions & 0 deletions scanpipe/pipelines/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def steps(cls):
return (
cls.extract_images,
cls.extract_layers,
cls.extract_archives,
cls.find_images_os_and_distro,
cls.collect_images_information,
cls.collect_and_create_codebase_resources,
Expand All @@ -42,6 +43,7 @@ def steps(cls):
cls.flag_ignored_resources,
cls.scan_for_application_packages,
cls.scan_for_files,
cls.scan_package_files,
cls.analyze_scanned_files,
cls.flag_not_analyzed_codebase_resources,
)
Expand Down
2 changes: 2 additions & 0 deletions scanpipe/pipelines/docker_windows.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def steps(cls):
return (
cls.extract_images,
cls.extract_layers,
cls.extract_archives,
cls.find_images_os_and_distro,
cls.collect_images_information,
cls.collect_and_create_codebase_resources,
Expand All @@ -45,6 +46,7 @@ def steps(cls):
cls.flag_ignored_resources,
cls.scan_for_application_packages,
cls.scan_for_files,
cls.scan_package_files,
cls.analyze_scanned_files,
cls.flag_data_files_with_no_clues,
cls.flag_not_analyzed_codebase_resources,
Expand Down
11 changes: 10 additions & 1 deletion scanpipe/pipelines/root_filesystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class RootFS(Pipeline):
def steps(cls):
return (
cls.extract_input_files_to_codebase_directory,
cls.extract_archives,
cls.find_root_filesystems,
cls.collect_rootfs_information,
cls.collect_and_create_codebase_resources,
Expand All @@ -45,6 +46,7 @@ def steps(cls):
cls.scan_for_application_packages,
cls.match_not_analyzed_to_system_packages,
cls.scan_for_files,
cls.scan_package_files,
cls.analyze_scanned_files,
cls.flag_not_analyzed_codebase_resources,
)
Expand Down Expand Up @@ -89,7 +91,7 @@ def collect_and_create_system_packages(self):
rootfs.scan_rootfs_for_system_packages(self.project, rfs)

def flag_uninteresting_codebase_resources(self):
    """Flag files—not worth tracking—that do not belong to any system packages."""
    # Only the corrected docstring is kept: a second string literal placed
    # after the docstring would be a no-op expression statement, and the
    # stale wording would remain the function's actual ``__doc__``.
    rootfs.flag_uninteresting_codebase_resources(self.project)

def scan_for_application_packages(self):
Expand Down Expand Up @@ -123,6 +125,13 @@ def scan_for_files(self):
"""Scan unknown resources for copyrights, licenses, emails, and urls."""
scancode.scan_for_files(self.project, progress_logger=self.log)

def scan_package_files(self):
    """
    Scan the files that are part of a package for copyrights, licenses,
    emails, and urls, reporting progress through this pipeline's logger.
    """
    scancode.scan_package_files(
        project=self.project,
        progress_logger=self.log,
    )

def analyze_scanned_files(self):
    """Check the scan results of individually scanned files for completeness."""
    project = self.project
    flag.analyze_scanned_files(project)
Expand Down
8 changes: 8 additions & 0 deletions scanpipe/pipelines/scan_codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def steps(cls):
cls.flag_ignored_resources,
cls.scan_for_application_packages,
cls.scan_for_files,
cls.scan_package_files,
)

def copy_inputs_to_codebase_directory(self):
Expand All @@ -65,3 +66,10 @@ def scan_for_application_packages(self):
def scan_for_files(self):
    """
    Run the copyright, license, email, and url scan on the project's
    unknown resources, reporting progress through this pipeline's logger.
    """
    scancode.scan_for_files(
        project=self.project,
        progress_logger=self.log,
    )

def scan_package_files(self):
    """
    Scan the files that belong to detected packages (such as application
    package manifests) for copyrights, licenses, emails, and urls.

    Progress is reported through this pipeline's logger.
    """
    scancode.scan_package_files(
        project=self.project,
        progress_logger=self.log,
    )
50 changes: 45 additions & 5 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,9 @@ def scan_for_package_data(location, with_threading=True, package_only=False, **k
return _scan_resource(location, scanners, with_threading=with_threading)


def save_scan_file_results(codebase_resource, scan_results, scan_errors):
def save_scan_file_results(
codebase_resource, scan_results, scan_errors, update_status=True, **kwargs
):
"""
Save the resource scan file results in the database.
Create project errors if any occurred during the scan.
Expand All @@ -263,6 +265,9 @@ def save_scan_file_results(codebase_resource, scan_results, scan_errors):
codebase_resource.add_errors(scan_errors)
status = flag.SCANNED_WITH_ERROR

if not update_status:
status = None

codebase_resource.set_scan_results(scan_results, status)


Expand All @@ -283,7 +288,12 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):


def scan_resources(
resource_qs, scan_func, save_func, scan_func_kwargs=None, progress_logger=None
resource_qs,
scan_func,
save_func,
scan_func_kwargs=None,
save_func_kwargs=None,
progress_logger=None,
):
"""
Run the `scan_func` on the codebase resources of the provided `resource_qs`.
Expand All @@ -303,6 +313,9 @@ def scan_resources(
if not scan_func_kwargs:
scan_func_kwargs = {}

if not save_func_kwargs:
save_func_kwargs = {}

resource_count = resource_qs.count()
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
resource_iterator = resource_qs.iterator(chunk_size=2000)
Expand All @@ -317,7 +330,7 @@ def scan_resources(
scan_results, scan_errors = scan_func(
resource.location, with_threading, **scan_func_kwargs
)
save_func(resource, scan_results, scan_errors)
save_func(resource, scan_results, scan_errors, **save_func_kwargs)
return

logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")
Expand All @@ -344,10 +357,10 @@ def scan_resources(
"CPU core for successful execution."
)
raise broken_pool_error from InsufficientResourcesError(message)
save_func(resource, scan_results, scan_errors)
save_func(resource, scan_results, scan_errors, **save_func_kwargs)


def scan_for_files(project, resource_qs=None, progress_logger=None):
def scan_for_files(project, resource_qs=None, progress_logger=None, update_status=True):
"""
Run a license, copyright, email, and url scan on files without a status for
a `project`.
Expand All @@ -363,12 +376,39 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
if license_score := project.get_env("scancode_license_score"):
scan_func_kwargs["min_license_score"] = license_score

save_func_kwargs = {
"update_status": update_status,
}

scan_resources(
resource_qs=resource_qs,
scan_func=scan_file,
save_func=save_scan_file_results,
scan_func_kwargs=scan_func_kwargs,
save_func_kwargs=save_func_kwargs,
progress_logger=progress_logger,
)


def scan_package_files(project, progress_logger=None, update_status=False):
    """
    Scan the files of `project` that are part of a package for copyrights,
    licenses, emails, and urls.

    The resources scanned are those returned by
    `project.codebaseresources.package_files()`.

    When `update_status` is False (the default), the status field of the
    scanned codebase resources is not updated to `scanned` (normally a
    side effect of scanning files); their previous status is kept intact.
    """
    scan_for_files(
        project=project,
        resource_qs=project.codebaseresources.package_files(),
        progress_logger=progress_logger,
        update_status=update_status,
    )


Expand Down
Loading