Skip to content

Commit bd5b14e

Browse files
Scan package files and extract for packages
For rootfs pipelines (rootfs, docker, docker-windows), all package files that were part of system packages had their status updated and consequently were not being scanned for licenses, copyrights, emails, and URLs. We were also not scanning package metadata files tagged as application packages in the scan_codebase and rootfs pipelines. This commit scans all package files and package metadata files to make sure we are not missing any information.

Reference: #762
Reference: #1194
Reference: #83

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 77faa3f commit bd5b14e

12 files changed

+6913
-208
lines changed

scanpipe/models.py

+11
Original file line number | Diff line number | Diff line change
@@ -1909,6 +1909,17 @@ def no_status(self, status=None):
19091909
return self.filter(~Q(status=status))
19101910
return self.filter(status="")
19111911

1912+
def package_files(self):
1913+
"""
1914+
Filter for CodebaseResources which are part of either an application
1915+
package or a system package.
1916+
"""
1917+
from scanpipe.pipes import flag
1918+
1919+
return self.filter(
1920+
Q(status=flag.APPLICATION_PACKAGE) | Q(status=flag.SYSTEM_PACKAGE)
1921+
)
1922+
19121923
def empty(self):
19131924
return self.filter(Q(size__isnull=True) | Q(size=0))
19141925

scanpipe/pipelines/docker.py

+2
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def steps(cls):
3333
return (
3434
cls.extract_images,
3535
cls.extract_layers,
36+
cls.extract_archives,
3637
cls.find_images_os_and_distro,
3738
cls.collect_images_information,
3839
cls.collect_and_create_codebase_resources,
@@ -42,6 +43,7 @@ def steps(cls):
4243
cls.flag_ignored_resources,
4344
cls.scan_for_application_packages,
4445
cls.scan_for_files,
46+
cls.scan_package_files,
4547
cls.analyze_scanned_files,
4648
cls.flag_not_analyzed_codebase_resources,
4749
)

scanpipe/pipelines/docker_windows.py

+2
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ def steps(cls):
3434
return (
3535
cls.extract_images,
3636
cls.extract_layers,
37+
cls.extract_archives,
3738
cls.find_images_os_and_distro,
3839
cls.collect_images_information,
3940
cls.collect_and_create_codebase_resources,
@@ -45,6 +46,7 @@ def steps(cls):
4546
cls.flag_ignored_resources,
4647
cls.scan_for_application_packages,
4748
cls.scan_for_files,
49+
cls.scan_package_files,
4850
cls.analyze_scanned_files,
4951
cls.flag_data_files_with_no_clues,
5052
cls.flag_not_analyzed_codebase_resources,

scanpipe/pipelines/root_filesystem.py

+10-1
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ class RootFS(Pipeline):
3535
def steps(cls):
3636
return (
3737
cls.extract_input_files_to_codebase_directory,
38+
cls.extract_archives,
3839
cls.find_root_filesystems,
3940
cls.collect_rootfs_information,
4041
cls.collect_and_create_codebase_resources,
@@ -45,6 +46,7 @@ def steps(cls):
4546
cls.scan_for_application_packages,
4647
cls.match_not_analyzed_to_system_packages,
4748
cls.scan_for_files,
49+
cls.scan_package_files,
4850
cls.analyze_scanned_files,
4951
cls.flag_not_analyzed_codebase_resources,
5052
)
@@ -89,7 +91,7 @@ def collect_and_create_system_packages(self):
8991
rootfs.scan_rootfs_for_system_packages(self.project, rfs)
9092

9193
def flag_uninteresting_codebase_resources(self):
92-
"""Flag files—not worth tracking—that don’t belong to any system packages."""
94+
"""Flag files—not worth tracking—that do not belong to any system packages."""
9395
rootfs.flag_uninteresting_codebase_resources(self.project)
9496

9597
def scan_for_application_packages(self):
@@ -123,6 +125,13 @@ def scan_for_files(self):
123125
"""Scan unknown resources for copyrights, licenses, emails, and urls."""
124126
scancode.scan_for_files(self.project, progress_logger=self.log)
125127

128+
def scan_package_files(self):
129+
"""
130+
Scan files which are part of a package, for copyright, license, email
131+
and urls.
132+
"""
133+
scancode.scan_package_files(self.project, progress_logger=self.log)
134+
126135
def analyze_scanned_files(self):
127136
"""Analyze single file scan results for completeness."""
128137
flag.analyze_scanned_files(self.project)

scanpipe/pipelines/scan_codebase.py

+8
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ def steps(cls):
4545
cls.flag_ignored_resources,
4646
cls.scan_for_application_packages,
4747
cls.scan_for_files,
48+
cls.scan_package_files,
4849
)
4950

5051
def copy_inputs_to_codebase_directory(self):
@@ -65,3 +66,10 @@ def scan_for_application_packages(self):
6566
def scan_for_files(self):
6667
"""Scan unknown resources for copyrights, licenses, emails, and urls."""
6768
scancode.scan_for_files(self.project, progress_logger=self.log)
69+
70+
def scan_package_files(self):
71+
"""
72+
Scan files which are manifests for detected application packages, for copyright,
73+
license, email and urls.
74+
"""
75+
scancode.scan_package_files(self.project, progress_logger=self.log)

scanpipe/pipes/scancode.py

+45-5
Original file line number | Diff line number | Diff line change
@@ -252,7 +252,9 @@ def scan_for_package_data(location, with_threading=True, package_only=False, **k
252252
return _scan_resource(location, scanners, with_threading=with_threading)
253253

254254

255-
def save_scan_file_results(codebase_resource, scan_results, scan_errors):
255+
def save_scan_file_results(
256+
codebase_resource, scan_results, scan_errors, update_status=True, **kwargs
257+
):
256258
"""
257259
Save the resource scan file results in the database.
258260
Create project errors if any occurred during the scan.
@@ -263,6 +265,9 @@ def save_scan_file_results(codebase_resource, scan_results, scan_errors):
263265
codebase_resource.add_errors(scan_errors)
264266
status = flag.SCANNED_WITH_ERROR
265267

268+
if not update_status:
269+
status = None
270+
266271
codebase_resource.set_scan_results(scan_results, status)
267272

268273

@@ -283,7 +288,12 @@ def save_scan_package_results(codebase_resource, scan_results, scan_errors):
283288

284289

285290
def scan_resources(
286-
resource_qs, scan_func, save_func, scan_func_kwargs=None, progress_logger=None
291+
resource_qs,
292+
scan_func,
293+
save_func,
294+
scan_func_kwargs=None,
295+
save_func_kwargs=None,
296+
progress_logger=None,
287297
):
288298
"""
289299
Run the `scan_func` on the codebase resources of the provided `resource_qs`.
@@ -303,6 +313,9 @@ def scan_resources(
303313
if not scan_func_kwargs:
304314
scan_func_kwargs = {}
305315

316+
if not save_func_kwargs:
317+
save_func_kwargs = {}
318+
306319
resource_count = resource_qs.count()
307320
logger.info(f"Scan {resource_count} codebase resources with {scan_func.__name__}")
308321
resource_iterator = resource_qs.iterator(chunk_size=2000)
@@ -317,7 +330,7 @@ def scan_resources(
317330
scan_results, scan_errors = scan_func(
318331
resource.location, with_threading, **scan_func_kwargs
319332
)
320-
save_func(resource, scan_results, scan_errors)
333+
save_func(resource, scan_results, scan_errors, **save_func_kwargs)
321334
return
322335

323336
logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")
@@ -344,10 +357,10 @@ def scan_resources(
344357
"CPU core for successful execution."
345358
)
346359
raise broken_pool_error from InsufficientResourcesError(message)
347-
save_func(resource, scan_results, scan_errors)
360+
save_func(resource, scan_results, scan_errors, **save_func_kwargs)
348361

349362

350-
def scan_for_files(project, resource_qs=None, progress_logger=None):
363+
def scan_for_files(project, resource_qs=None, progress_logger=None, update_status=True):
351364
"""
352365
Run a license, copyright, email, and url scan on files without a status for
353366
a `project`.
@@ -363,12 +376,39 @@ def scan_for_files(project, resource_qs=None, progress_logger=None):
363376
if license_score := project.get_env("scancode_license_score"):
364377
scan_func_kwargs["min_license_score"] = license_score
365378

379+
save_func_kwargs = {
380+
"update_status": update_status,
381+
}
382+
366383
scan_resources(
367384
resource_qs=resource_qs,
368385
scan_func=scan_file,
369386
save_func=save_scan_file_results,
370387
scan_func_kwargs=scan_func_kwargs,
388+
save_func_kwargs=save_func_kwargs,
389+
progress_logger=progress_logger,
390+
)
391+
392+
393+
def scan_package_files(
394+
project,
395+
progress_logger=None,
396+
update_status=False,
397+
):
398+
"""
399+
Scan files which are part of a package, for copyright, license, email
400+
and urls.
401+
402+
If `update_status` is False, the status field of codebase resources is not
403+
updated to `scanned` (which is a side-effect of scanning files), but rather
404+
keep the old status intact.
405+
"""
406+
package_files = project.codebaseresources.package_files()
407+
scan_for_files(
408+
project=project,
409+
resource_qs=package_files,
371410
progress_logger=progress_logger,
411+
update_status=update_status,
372412
)
373413

374414

0 commit comments

Comments
 (0)