Commit 75970bdc authored by Thomas Boni's avatar Thomas Boni
Browse files

Merge branch...

Merge branch '241-add-an-antivirus-verification-for-every-docker-image-in-every-job-2' into 'latest'

Resolve "Add an antivirus verification for every docker image in every job"

Closes #241

See merge request r2devops/hub!199
parents bfe2b0f0 3f2fc43b
Loading
Loading
Loading
Loading
+67 −0
Original line number Diff line number Diff line
stages:
  - static_tests
  - merge_tests
  - project_setup
  - build
  - dynamic_tests
@@ -250,3 +251,69 @@ release:
    done
  rules:
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PROJECT_PATH_SLUG == "r2devops-hub"'

# Scheduled job: downloads the latest ClamAV signature database with
# `freshclam` and pushes it to the `clamav-db` cache.
refresh_job_av_database:
  image: python:3.9.1-alpine
  stage: static_tests
  variables:
    PIPENV_PIPFILE: tools/job_av/Pipfile
    PYTHONPATH: "./:${PYTHONPATH}"
  script:
    - apk add --no-cache python3 py3-pip clamav util-linux
    - pip install --ignore-installed distlib pipenv
    - pipenv install

    # Updating the ClamAV database to push it into the cache
    - freshclam

    # Runner cannot access /var/lib/clamav, so we need to copy it
    # in order to cache it properly.
    # `-p` keeps the job from failing when the directory already exists.
    - mkdir -p freshclam_db
    - cp /var/lib/clamav/* freshclam_db
  cache:
    key: "clamav-db"
    paths:
      - freshclam_db
    # This job only refreshes the cache, it never needs to download it
    policy: push
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule" && $CI_CLAMAV_PIPELINE && $CI_PROJECT_PATH_SLUG == "r2devops-hub"'

# Runs tools/job_av/job_av.py to generate the child pipeline definition
# (`GENERATED_YAML`) and publishes it as an artifact.
generate_job_av:
  image: python:3.9.1-alpine
  stage: static_tests
  variables:
    PIPENV_PIPFILE: tools/job_av/Pipfile
    JOB_LOGFILE: "generate_job.log"
    GENERATED_YAML: "generated-gitlab-ci.yml"
    PYTHONPATH: "./:${PYTHONPATH}"
    SCANNED_IMAGES_FILE: "clamav_scanned_images.json"
  script:
    # Print the list restored from the cache, when there is one
    - if [ -f "${SCANNED_IMAGES_FILE}" ]; then cat "${SCANNED_IMAGES_FILE}"; fi
    - apk add --no-cache python3 py3-pip util-linux
    - pip install --ignore-installed distlib pipenv
    - pipenv install

    - pipenv run python3 tools/job_av/job_av.py
    - cat "${GENERATED_YAML}"
  artifacts:
    paths:
      - ${GENERATED_YAML}
    expire_in: 30 days
    when: always
  cache:
    key: "clamav-scanned-jobs"
    paths:
      # Must stay in sync with the SCANNED_IMAGES_FILE variable above
      - clamav_scanned_images.json
    policy: pull-push
  rules:
    - if: '$CI_MERGE_REQUEST_EVENT_TYPE == "merge_train" && $CI_PROJECT_PATH_SLUG == "r2devops-hub"'

# Triggers the child pipeline described by the file that `generate_job_av`
# publishes as an artifact.
child_job_av:
  stage: merge_tests
  rules:
    - if: '$CI_MERGE_REQUEST_EVENT_TYPE == "merge_train" && $CI_PROJECT_PATH_SLUG == "r2devops-hub"'
  trigger:
    # `depend` makes this job wait for the child pipeline and mirror its status
    strategy: depend
    include:
      - job: generate_job_av
        artifact: generated-gitlab-ci.yml

tools/job_av/Pipfile

0 → 100644
+10 −0
Original line number Diff line number Diff line
# Package index used to resolve the dependencies (PyPI, with TLS verification)
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

# Python version required by this Pipfile
[requires]
python_version = "3"

# Runtime dependencies, pinned to an exact version
[packages]
pyyaml = "==5.3.1"
+52 −0
Original line number Diff line number Diff line
# Only stage of the generated child pipeline
stages:
  - static_tests

# The pipeline is created for merge requests, tags and the default branch
workflow:
  rules:
    - if: '$CI_MERGE_REQUEST_IID'
    - if: '$CI_COMMIT_TAG'
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'

# Template of the ClamAV scan job. tools/job_av/job_av.py loads this file,
# fills `parallel.matrix[0].IMAGE` with the images to scan (and copies the job
# for every extra chunk of images) before writing the generated CI file.
job_av:
  image: docker:19.03
  stage: static_tests
  services:
    - name: docker:19.03-dind
      entrypoint: ["env", "-u", "DOCKER_HOST"]
      command: ["dockerd-entrypoint.sh"]
  variables:
    PIPENV_PIPFILE: tools/job_image/Pipfile
    AV_LOGFILE: "clamav.log"

    DOCKER_HOST: tcp://docker:2375
    DOCKER_DRIVER: overlay2
    DOCKER_TLS_CERTDIR: ""
  script:
    # NOTE(review): none of the commands below invoke Python; confirm whether
    # the pip/pipenv setup is still required by this job.
    - apk add --no-cache python3 py3-pip clamav util-linux
    - pip install --ignore-installed distlib pipenv
    - pipenv install
    # Moving cache into the proper directory
    # See `refresh_job_av_database` in .gitlab-ci.yml for more info about it
    - mkdir -p /var/lib/clamav
    - cp freshclam_db/* /var/lib/clamav/
    # Add cut of `@` for specific image tags (eg: `lighthouse@sha...`)
    - JOB=$(basename "${IMAGE}" | cut -d':' -f1 | cut -d'@' -f1)
    # Export the image filesystem so it can be scanned as plain files
    - docker create --name "${JOB}" "${IMAGE}"
    - docker export "${JOB}" > "${JOB}.tar"
    - mkdir "${JOB}"
    - tar -xf "${JOB}.tar" -C "${JOB}"
    # clamscan exits with a non-zero status when it finds a virus or hits an
    # error. Remember the status instead of aborting right away so the report
    # is always printed, then fail the job with the original status.
    - SCAN_STATUS=0
    - clamscan -ir "${JOB}" --max-filesize=50M > "${JOB}-scan.log" 2>&1 || SCAN_STATUS=$?
    - cat "${JOB}-scan.log"
    - if [ "${SCAN_STATUS}" -ne 0 ]; then exit "${SCAN_STATUS}"; fi
  artifacts:
    paths:
      - "*-scan.log"
    expire_in: 30 days
    when: always
  cache:
    key: "clamav-db"
    paths:
      - freshclam_db
    policy: pull
  parallel:
    matrix:
      # Filled by tools/job_av/job_av.py with the images to scan
      - IMAGE: []
 No newline at end of file

tools/job_av/job_av.py

0 → 100644
+164 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

import os
import logging
import json
import sys
import copy
import argparse
import yaml
from tools.job_image.job_image import get_image

# Add the current directory to the import path so the Config module can be imported
# /!\ This only works when the script is run from the root of the project
sys.path.insert(0, "./")
from tools.utils.utils import Config

utils = Config()

# Job variables
# Path of the CI file to generate, read from the GENERATED_YAML environment variable
CI_FILENAME = os.getenv("GENERATED_YAML")
# Maximum number of images put in the `parallel.matrix` of one generated job
PARALLEL_LIMIT = 50

# Name of the template CI file and of its directory inside the tools directory
BASE_FILE = "base-gitlab-ci.yml"
JOB_DIR = "job_av"

# Path of the JSON file listing the already scanned images, read from the
# SCANNED_IMAGES_FILE environment variable
SCANNED_IMAGES_FILE = os.getenv("SCANNED_IMAGES_FILE")
# NOTE(review): not referenced in the visible part of this script
SCANNED_IMAGES = []

# Images that are never added to the list of images to scan
BLACKLIST = ["github/super-linter:v3.14.3", "shiftleft/sast-scan:v1.9.29"]


def argparse_setup():
    """Parse the command line of the script

    No option is declared, so only the ``-h/--help`` flag provided by
    argparse is accepted.

    Return
    ------
    obj
        Namespace holding the parsed arguments
    """
    return argparse.ArgumentParser().parse_args()


def get_scanned_images():
    """Load the list of images recorded as already scanned

    The list is read from the JSON file designated by SCANNED_IMAGES_FILE.
    When that file is missing, unreadable, or does not contain a JSON list,
    it is (re)created with an empty list so every image is scanned again.

    :return: string[] with a list of scanned images
    """
    try:
        with open(SCANNED_IMAGES_FILE, "r") as images_file:
            scanned_image = json.load(images_file)
    except OSError:
        logging.warning("There isn't any file containing already scanned images")
    except ValueError as error:
        # json.JSONDecodeError (and decoding errors) are ValueError subclasses:
        # a corrupted cache must not break the pipeline generation
        logging.warning("Ignoring invalid scanned images file: %s", error)
    else:
        if isinstance(scanned_image, list):
            logging.info("Scanned images list: %s", scanned_image)
            return scanned_image
        # The caller concatenates this value with a list
        logging.warning("Ignoring scanned images file: it doesn't contain a list")

    with open(SCANNED_IMAGES_FILE, "w") as new_file:
        json.dump([], new_file)
    return []


def set_default_image(current_ci):
    """Turn the `job_av` job into a "nothing to run" job

    Used when there is no image to scan: the script of `job_av` is replaced
    by a single echo, the scan-specific sections are removed and a small
    image is used instead.

    Based on test this "default job" lasts 20 seconds.

    :param current_ci: dict of the CI file, modified in place
    :return:
    """
    default_job = current_ci["job_av"]
    default_job["script"] = ["echo 'There is not any image to scan, finishing...'"]

    # These sections are only useful for a real scan; a template that doesn't
    # declare one of them must not make the generation fail
    for section in ("services", "artifacts", "cache", "parallel"):
        default_job.pop(section, None)
    default_job["image"] = "alpine:3.13.2"

    logging.info("No image to scan, putting default useless child job")

def save_scanned_images(already_scanned):
    """Write the list of scanned images into the scanned images file

    The process exits with status 1 when the file cannot be written.

    :param already_scanned: array with a list of image
    :return:
    """
    try:
        with open(SCANNED_IMAGES_FILE, 'w') as output:
            json.dump(already_scanned, output)
    except OSError as error:
        logging.error("Failed to write new scanned images file %s", error)
        sys.exit(1)
    else:
        logging.info("Saved successfully scanned images in this pipeline")
        logging.info("There is %s images now scanned", len(already_scanned))


if __name__ == "__main__":
    # Main function, iterate over the jobs to get their image
    # and write a .gitlab-ci.yml that can run a child pipeline
    # in order to use ClamAV for virus detection
    #
    # Exit status: 0 on success, 1 when the scanned images file can't be saved

    # Setup argparse: no option is declared, the call is kept so that `--help`
    # works and unexpected arguments are rejected
    argparse_setup()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler(utils.LOGFILE_NAME),
            logging.StreamHandler()
        ]
    )

    logging.info("Getting all images from every job in the hub")
    jobs = os.listdir(utils.JOBS_DIR)
    scanned_images = get_scanned_images()
    images = []

    for job in jobs:
        try:
            image = get_image(job)
        except KeyError:
            # get_image raises KeyError when the image can't be resolved from
            # the job file (e.g. the job extends a job declared elsewhere)
            logging.warning("Unable to get the image of job %s, it will not be scanned", job)
            continue
        logging.info("Checking %s image", image)
        if image is None or image in images:
            continue
        if image in BLACKLIST or image in scanned_images:
            continue
        logging.info("Adding %s image to list of images to scan", image)
        images.append(image)

    logging.info("There is %s images to scan in this pipeline", len(images))

    image_chunks = [images[x: x + PARALLEL_LIMIT] for x in range(0, len(images), PARALLEL_LIMIT)]
    # NOTE(review): the images are recorded as scanned here, before the child
    # pipeline has run any scan - confirm this is intended
    save_scanned_images(images + scanned_images)

    with open(f"{utils.TOOLS_DIR}/{JOB_DIR}/{BASE_FILE}") as file:
        # The template is plain YAML, the safe loader is enough to read it
        ci_file = yaml.safe_load(file)

    logging.info("Creating jobs for each chunks of %i images to run in parallel", PARALLEL_LIMIT)
    for chunk_index, chunk in enumerate(image_chunks):
        if chunk_index == 0:
            # The first chunk fills the job declared in the template
            job_name = "job_av"
        else:
            # Every other chunk gets its own copy of the job, with an empty matrix
            job_name = f"job_av_{chunk_index}"
            ci_file[job_name] = copy.deepcopy(ci_file["job_av"])
            ci_file[job_name]["parallel"]["matrix"][0]["IMAGE"] = []

        matrix_images = ci_file[job_name]["parallel"]["matrix"][0]["IMAGE"]
        for image in chunk:
            logging.info("Adding image %s to job n°%i", image, chunk_index)
            matrix_images.append(image)

    if not images:
        set_default_image(ci_file)

    logging.info("Writing %s file", CI_FILENAME)
    with open(CI_FILENAME, "w") as file:
        yaml.dump(ci_file, file, sort_keys=False)
+29 −0
Original line number Diff line number Diff line
@@ -24,6 +24,34 @@ def argparse_setup():
    parser.add_argument("job", help="job name to get the image from")
    return parser.parse_args()

def get_image(job):
    """Get the image of a job

    The image is read from the job's own definition when it declares one,
    otherwise from the job it extends inside the same file.

    Parameters
    ----------
    job
        Job name

    Return
    ------
    str
        The name of the image of the job, or None when the job neither
        declares an image nor extends another job
    """
    logging.info(f"Getting the image for job {job}")

    with open(f"{utils.JOBS_DIR}/{job}/{job}.yml", 'r') as file:
        # NOTE(review): FullLoader is not a safe loader; if job files can come
        # from untrusted contributions, consider yaml.safe_load here
        data = yaml.load(file, Loader=yaml.FullLoader)

    definition = data[job]
    if "image" in definition.keys():
        image = definition['image']
    elif "extends" in definition.keys():
        # Raises KeyError when the extended job isn't declared in this file
        image = data[definition['extends']]['image']
    else:
        return None

    # `image` is either a plain string or a mapping with a `name` key
    return image['name'] if isinstance(image, dict) else image

if __name__ == "__main__":
    """Main function, get the name of the image for a job

@@ -81,3 +109,4 @@ if __name__ == "__main__":
        except KeyError :
            logging.warning('The job %s doesn\'t declare its image and extends a job from outside of the file, we aren\'t able to check its image vulnerabilities', args.job)
            # TODO: check images from included jobs ==> https://gitlab.com/r2devops/hub/-/issues/282