Commit 04a5efc1 authored by Gaëtan Montury's avatar Gaëtan Montury
Browse files

Merge branch 'feat/semgrep' into 'main'

feat: add SAST tool semgrep

Closes #132

See merge request to-be-continuous/python!194
parents 260e96b6 2491f62b
Loading
Loading
Loading
Loading
+28 −0
Original line number Diff line number Diff line
@@ -333,6 +333,34 @@ In addition to a textual report in the console, this job produces the following
| `$PYTHON_PROJECT_DIR/reports/py-trivy.gitlab-codequality.json` | [Trivy report format for GitLab Code Quality](https://trivy.dev/docs/latest/tutorials/integrations/gitlab-ci/) format       | [GitLab Code Quality](https://docs.gitlab.com/ci/yaml/artifacts_reports/#artifactsreportscodequality)  |
| `$PYTHON_PROJECT_DIR/reports/py-trivy.gitlab-sast.json` | [Trivy report format for GitLab SAST](https://trivy.dev/docs/latest/tutorials/integrations/gitlab-ci/) format       | [GitLab SAST](https://docs.gitlab.com/ci/yaml/artifacts_reports/#artifactsreportssast)  |

### `py-semgrep` job

This job performs a [Semgrep](https://semgrep.dev/docs/) analysis.

It is bound to the `test` stage, and uses the following variables:

| Input / Variable | Description | Default Value |
| ---------------- | ----------- | ------------- |
| `semgrep-disabled` / `PYTHON_SEMGREP_DISABLED` | Set to `true` to disable this job | _none_ (enabled) |
| `semgrep-image` / `PYTHON_SEMGREP_IMAGE` | The Docker image used to run [Semgrep](https://semgrep.dev/docs/) | `docker.io/semgrep/semgrep:latest` <br/>[![Trivy Badge](https://to-be-continuous.gitlab.io/doc/secu/trivy-badge-PYTHON_SEMGREP_IMAGE.svg)](https://to-be-continuous.gitlab.io/doc/secu/trivy-PYTHON_SEMGREP_IMAGE) |
| `semgrep-args` / `PYTHON_SEMGREP_ARGS` | Semgrep [scan options](https://semgrep.dev/docs/cli-reference#semgrep-scan-command-options) | `--metrics off --disable-version-check --error .` |
| `semgrep-rules` / `PYTHON_SEMGREP_RULES` | Space-separated list of [Semgrep rules](https://semgrep.dev/docs/running-rules).<br/>Can be either local YAML files or remote rules from the [Semgrep Registry](https://semgrep.dev/explore) (denoted by the `p/` prefix). | `p/python p/bandit p/gitlab-bandit p/owasp-top-ten p/security-audit` |
| `semgrep-download-rules-enabled` / `PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED` | Download Semgrep remote rules | `true` |

> :information_source: Semgrep may [collect some metrics](https://semgrep.dev/docs/metrics), especially when using rules from the Semgrep Registry.
> To protect your privacy and let you run Semgrep in air-gap environments, this template disables all Semgrep metrics by default:
>
> * rules from the Semgrep registry are pre-downloaded and passed to Semgrep as local rule files (can be disabled by setting `semgrep-download-rules-enabled` / `PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED` to `false`),
> * the `--metrics` option is set to `off`,
> * the `--disable-version-check` option is set.

In addition to a textual report in the console, this job produces the following reports, kept for one day:

| Report | Format | Usage |
| ------ | ------ | ----- |
| `$PYTHON_PROJECT_DIR/reports/py-semgrep.gitlab-sast.json` | [GitLab's SAST format](https://semgrep.dev/docs/cli-reference#semgrep-scan-command-options) | [GitLab integration](https://docs.gitlab.com/ci/yaml/artifacts_reports/#artifactsreportssast) |
| `$PYTHON_PROJECT_DIR/reports/py-semgrep.native.json` | [Semgrep's JSON format](https://semgrep.dev/docs/cli-reference#semgrep-scan-command-options) |  |


### `py-sbom` job

+29 −0
Original line number Diff line number Diff line
@@ -369,6 +369,35 @@
          "advanced": true
        }
      ]
    },
    {
      "id": "semgrep",
      "name": "Semgrep",
      "description": "[Semgrep](https://semgrep.dev/docs/) analysis",
      "disable_with": "PYTHON_SEMGREP_DISABLED",
      "variables": [
        {
          "name": "PYTHON_SEMGREP_IMAGE",
          "description": "The Docker image used to run [Semgrep](https://semgrep.dev/docs/)",
          "default": "docker.io/semgrep/semgrep:latest"
        },
        {
          "name": "PYTHON_SEMGREP_ARGS",
          "description": "Semgrep [scan options](https://semgrep.dev/docs/cli-reference#semgrep-scan-command-options)",
          "default": "--metrics off --disable-version-check --error ."
        },
        {
          "name": "PYTHON_SEMGREP_RULES",
          "description": "Space-separated list of [Semgrep rules](https://semgrep.dev/docs/running-rules).\n\nCan be both local YAML files or remote rules from the [Semgrep Registry](https://semgrep.dev/explore) (denoted by the `p/` prefix)",
          "default": "p/python p/bandit p/gitlab-bandit p/owasp-top-ten p/security-audit"
        },
        {
          "name": "PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED",
          "description": "Download Semgrep remote rules",
          "type": "boolean",
          "default": "true"
        }
      ]
    }
  ],
  "variants": [
+155 −0
Original line number Diff line number Diff line
@@ -230,6 +230,27 @@ spec:
    ty-args:
      description: Additional [ty CLI options](https://docs.astral.sh/ty/reference/cli/#ty-check)
      default: ""
    # --- Semgrep (SAST) inputs ---
    semgrep-disabled:
      description: Disable Semgrep
      type: boolean
      default: false
    semgrep-image:
      description: The Docker image used to run [Semgrep](https://semgrep.dev/docs/)
      default: docker.io/semgrep/semgrep:latest
    semgrep-args:
      description: Semgrep [scan options](https://semgrep.dev/docs/cli-reference#semgrep-scan-command-options)
      # quoted: the value starts with dashes and must always be read as a single string
      default: "--metrics off --disable-version-check --error ."
    semgrep-rules:
      description: |-
        Space-separated list of [Semgrep rules](https://semgrep.dev/docs/running-rules).

        Can be both local YAML files or remote rules from the [Semgrep Registry](https://semgrep.dev/explore) (denoted by the `p/` prefix)
      default: p/python p/bandit p/gitlab-bandit p/owasp-top-ten p/security-audit
    semgrep-download-rules-enabled:
      description: Download Semgrep remote rules
      type: boolean
      default: true

---
# default workflow rules: Merge Request pipelines
.tbc-workflow-rules:
@@ -368,21 +389,38 @@ variables:
  PYTHON_PUBLISH_ENABLED: $[[ inputs.publish-enabled ]]
  PYTHON_AUTO_RELEASE_ENABLED: $[[ inputs.auto-release-enabled ]]

  # Black
  PYTHON_BLACK_ENABLED: $[[ inputs.black-enabled ]]
  PYTHON_BLACK_ARGS: $[[ inputs.black-args ]]

  # Isort
  PYTHON_ISORT_ENABLED: $[[ inputs.isort-enabled ]]
  PYTHON_ISORT_ARGS: $[[ inputs.isort-args ]]

  # Ruff
  RUFF_ENABLED: $[[ inputs.ruff-enabled ]]
  RUFF_ARGS: $[[ inputs.ruff-args ]]
  RUFF_FORMAT_ENABLED: $[[ inputs.ruff-format-enabled ]]

  # Mypy
  MYPY_ENABLED: $[[ inputs.mypy-enabled ]]
  MYPY_ARGS: $[[ inputs.mypy-args ]]

  # Pyright / basedpyright
  PYRIGHT_ENABLED: $[[ inputs.pyright-enabled ]]
  PYRIGHT_ARGS: $[[ inputs.pyright-args ]]
  PYRIGHT_LEVEL: $[[ inputs.pyright-level ]]

  # Ty
  TY_ENABLED: $[[ inputs.ty-enabled ]]
  TY_ARGS: $[[ inputs.ty-args ]]

  # Semgrep
  PYTHON_SEMGREP_DISABLED: $[[ inputs.semgrep-disabled ]]
  PYTHON_SEMGREP_IMAGE: $[[ inputs.semgrep-image ]]
  PYTHON_SEMGREP_ARGS: $[[ inputs.semgrep-args ]]
  PYTHON_SEMGREP_RULES: $[[ inputs.semgrep-rules ]]
  PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED: $[[ inputs.semgrep-download-rules-enabled ]]

.python-scripts: &python-scripts |
  # BEGSCRIPT
@@ -1331,6 +1369,89 @@ variables:
    python3 -c 'import urllib.request;urllib.request.urlretrieve("'"$url"'","'"$output"'")'
  }

  # Resolves every entry of $PYTHON_SEMGREP_RULES to a local rule file.
  #   - entries that are existing files are passed through (as absolute paths)
  #   - any other entry is downloaded from https://semgrep.dev/c/<entry> into
  #     $SEMGREP_CACHE_DIR, using the ETag stored by a previous run to issue a
  #     conditional request (HTTP 304 => the cached copy is reused)
  # stdout: space-separated list of usable rule files (captured by the caller)
  # stderr: a one-line download summary
  function semgrep_download_rules_py() {
    python3 << 'EOF'
  import os, json, sys
  from pathlib import Path
  from urllib.request import Request, urlopen
  from urllib.error import HTTPError

  cache = Path(os.getenv("SEMGREP_CACHE_DIR"))
  cache.mkdir(parents=True, exist_ok=True)
  metadata_file = cache / ".semgrep-cache-metadata.json"

  # a missing, unreadable or corrupted metadata file only means "nothing cached yet"
  try:
      cache_metadata_prev = json.loads(metadata_file.read_text()) if metadata_file.exists() else {}
  except (OSError, ValueError):
      cache_metadata_prev = {}
  if not isinstance(cache_metadata_prev, dict):
      cache_metadata_prev = {}
  cache_metadata = {}

  for rule in os.getenv("PYTHON_SEMGREP_RULES", "").split():
      if Path(rule).is_file():
          # local rule file: nothing to download, but it must be part of the returned list
          cache_metadata[rule] = {"file": str(Path(rule).resolve()), "exist": True, "local": True}
          continue

      dest = cache / f"semgrep-{rule.replace('/', '-')}.yml"
      prev = cache_metadata_prev.get(rule)
      prev_etag = prev.get("etag") if isinstance(prev, dict) else None
      req = Request(f"https://semgrep.dev/c/{rule}")
      # only send a conditional request when the cached copy is really there
      if dest.exists() and prev_etag:
          req.add_header("If-None-Match", prev_etag)
      entry = cache_metadata[rule] = {"file": str(dest), "etag": prev_etag}
      try:
          with urlopen(req, timeout=30) as resp:
              dest.write_bytes(resp.read())
              entry.update({"exist": True, "use_cache": False})
              tag = resp.headers.get("ETag")
              if tag:
                  entry["etag"] = tag
      except HTTPError as e:
          if e.code == 304:
              # not modified: keep using the cached copy
              entry.update({"exist": dest.exists(), "use_cache": True})
          else:
              entry.update({"exist": dest.exists(), "error": str(e)})
      except OSError as e:
          # URLError is an OSError; this also covers read timeouts and write failures
          entry.update({"exist": dest.exists(), "error": str(e)})

  metadata_file.write_text(json.dumps(cache_metadata, indent=2))
  # stdout carries ONLY the rule files list: it is captured by the calling shell function
  print(" ".join(v["file"] for v in cache_metadata.values() if v.get("exist") and not v.get("error")))
  entries = list(cache_metadata.values())
  local = sum(1 for v in entries if v.get("local"))
  downloaded = sum(1 for v in entries if v.get("use_cache") is False)
  from_cache = sum(1 for v in entries if v.get("use_cache") is True)
  errors = sum(1 for v in entries if "error" in v)
  print(f"[\x1b[1;94mINFO\x1b[0m] Semgrep rules download summary: total={local + downloaded + from_cache + errors} local={local} downloaded={downloaded} from_cache={from_cache} errors={errors}", file=sys.stderr)
  EOF
  }

  # Match a value against a GitLab CI regex pattern written as /.../
  #   $1: value to test (e.g. a branch or tag name)
  #   $2: pattern, with its surrounding slashes
  # Returns 0 when the value matches, non-zero otherwise.
  # An unset or empty pattern never matches (an empty regex would match anything).
  match_ref() {
    local value="${1:-}"
    local pattern="${2:-}"
    # strip the surrounding slashes, and only those
    pattern="${pattern#/}"
    pattern="${pattern%/}"
    [[ -n "$pattern" ]] || return 1
    # $pattern is left unquoted on purpose: quoting it would force a literal match
    [[ "$value" =~ $pattern ]]
  }

  # Runs the Semgrep scan and writes the GitLab SAST and native JSON reports
  # into ${PYTHON_PROJECT_DIR}/reports.
  #   - rules come from $PYTHON_SEMGREP_RULES, optionally replaced by their
  #     pre-downloaded local copies (PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED)
  #   - on refs matching $RELEASE_REF, $INTEG_REF or $PROD_REF a full scan is run;
  #     on any other ref a differential scan against the merge-base with
  #     $CI_DEFAULT_BRANCH is attempted, falling back to a full scan
  # Fails the job when Semgrep exits with a non-zero code.
  function semgrep_run_py() {
    # locals, explicitly initialised: a value inherited from the environment (or a
    # failed 'git fetch' leaving the variable unset) must not drive the scan mode
    local baseline_commit=""
    local semgrep_rc=0
    local semgrep_reports

    # SEMGREP_RULES is exported for the 'semgrep' process started below
    export SEMGREP_RULES="$PYTHON_SEMGREP_RULES"
    if [[ "$PYTHON_SEMGREP_DOWNLOAD_RULES_ENABLED" == "true" ]]; then
      log_info "Download Semgrep rules..."
      SEMGREP_RULES=$(semgrep_download_rules_py)
    fi
    log_info "Using Semgrep rules $(echo "${SEMGREP_RULES:-}" | xargs -n1 basename 2>/dev/null | xargs)"

    semgrep_reports="--gitlab-sast-output=${PYTHON_PROJECT_DIR}/reports/py-semgrep.gitlab-sast.json  --json-output=${PYTHON_PROJECT_DIR}/reports/py-semgrep.native.json"

    if ! match_ref "$CI_COMMIT_REF_NAME" "$RELEASE_REF" && ! match_ref "$CI_COMMIT_REF_NAME" "$INTEG_REF" && ! match_ref "$CI_COMMIT_REF_NAME" "$PROD_REF"; then
      # any other ref: try to scan only what changed since the default branch
      if git fetch origin "${CI_DEFAULT_BRANCH}" --depth=50 2>/dev/null; then
        baseline_commit="$(git merge-base "origin/${CI_DEFAULT_BRANCH}" "${CI_COMMIT_SHA}" 2>/dev/null)" || true
      fi
      if [[ -z "${baseline_commit}" ]]; then
        log_info "No baseline found (${CI_DEFAULT_BRANCH}), falling back to full scan"
      fi
    fi

    if [[ -z "${baseline_commit}" ]]; then
      log_info "Launch semgrep full scan: ${TRACE+--verbose} ${semgrep_reports} ${PYTHON_SEMGREP_ARGS}"
      # shellcheck disable=SC2086
      semgrep scan ${TRACE+--verbose} ${semgrep_reports} ${PYTHON_SEMGREP_ARGS} || semgrep_rc=$?
    else
      log_info "Launch semgrep differential scan from baseline-commit: --baseline-commit=${baseline_commit} ${TRACE+--verbose} ${semgrep_reports} ${PYTHON_SEMGREP_ARGS}"
      # shellcheck disable=SC2086
      semgrep scan --baseline-commit=${baseline_commit} ${TRACE+--verbose} ${semgrep_reports} ${PYTHON_SEMGREP_ARGS} || semgrep_rc=$?
    fi

    # Semgrep exit codes: 0 = no blocking finding, 1 = findings (with --error),
    # anything else = Semgrep itself failed (bad config, invalid rule, crash...)
    case "${semgrep_rc}" in
      0)
        log_info "Semgrep scan completed with no issues detected"
        ;;
      1)
        fail "Semgrep scan detected issues in the codebase. Review the report at: ${PYTHON_PROJECT_DIR}/reports/py-semgrep.gitlab-sast.json"
        ;;
      *)
        fail "Semgrep scan failed (exit code ${semgrep_rc}). Check the job logs above for details"
        ;;
    esac
  }


  unscope_variables
  eval_all_secrets

@@ -1919,6 +2040,40 @@ py-sbom:
    # 'onrelease' mode: use common software delivery rules
    - !reference [.delivery-policy, rules]

py-semgrep:
  # Semgrep SAST scan of the project.
  # Extends .python-base but sets its own image (Semgrep) and before_script.
  # NOTE(review): .python-base is defined elsewhere - confirm that the settings
  # still inherited from it suit a non-Python image.
  extends: .python-base
  image: $PYTHON_SEMGREP_IMAGE
  stage: test
  variables:
    # where downloaded Semgrep rules and their metadata are stored
    # (read by semgrep_download_rules_py)
    SEMGREP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/semgrep"
  before_script:
    - !reference [.python-scripts]
    - install_ca_certs "${CUSTOM_CA_CERTS:-$DEFAULT_CA_CERTS}"
    # Ignore cached files: git-ignore everything under .cache
    # (the '*' entry is only added when missing, so it is not duplicated on each run)
    - |
      mkdir -p "$CI_PROJECT_DIR/.cache/semgrep"
      if ! grep -qxF '*' "$CI_PROJECT_DIR/.cache/.gitignore" 2>/dev/null; then
        echo "*" >> "$CI_PROJECT_DIR/.cache/.gitignore"
      fi
    - cd "${PYTHON_PROJECT_DIR}"
  script:
    - mkdir -p -m 777 reports
    # fail early, with an explicit message, when the image does not provide semgrep
    - |
      if ! command -v semgrep > /dev/null; then
        fail "semgrep not found ! Please provide an image with semgrep installed."
      fi
    - semgrep_run_py
  artifacts:
    name: "$CI_JOB_NAME artifacts from $CI_PROJECT_NAME on $CI_COMMIT_REF_SLUG"
    expire_in: 1 day
    # reports are kept even when the scan fails the job
    when: always
    reports:
      sast: "${PYTHON_PROJECT_DIR}/reports/py-semgrep.gitlab-sast.json"
    paths:
      - ${PYTHON_PROJECT_DIR}/reports/py-semgrep.*
  rules:
    # exclude if disabled
    - if: '$PYTHON_SEMGREP_DISABLED == "true"'
      when: never
    - !reference [.test-policy, rules]

# (manual from master branch): triggers a release (tag creation)
py-release:
  extends: .python-base