Merge pull request #9481 from Lyndon-Li/issue-fix-9478

Issue 9478: Diagnose expose on peek error
Merge pull request #9487 from Lyndon-Li/issue-fix-for-cache-volume
2026-01-19 11:12:52 +00:00 · 2026-01-15 16:53:57 +08:00 · 2026-01-15 11:43:36 +08:00 · 2026-01-14 17:45:01 +08:00 · 2026-01-14 00:25:58 -05:00 · 2026-01-13 16:33:14 +08:00
716 changed files with 59147 additions and 11248 deletions
--- a/.github/workflows/e2e-test-kind.yaml
+++ b/.github/workflows/e2e-test-kind.yaml
@@ -8,18 +8,26 @@ on:
      - "design/**"
      - "**/*.md"
 jobs:
+  get-go-version:
+    uses: ./.github/workflows/get-go-version.yaml
+    with:
+      ref: ${{ github.event.pull_request.base.ref }}
+
  # Build the Velero CLI and image once for all Kubernetes versions, and cache it so the fan-out workers can get it.
  build:
    runs-on: ubuntu-latest
+    needs: get-go-version
    outputs:
      minio-dockerfile-sha: ${{ steps.minio-version.outputs.dockerfile_sha }}
    steps:
      - name: Check out the code
-        uses: actions/checkout@v4
-      - name: Set up Go
-        uses: actions/setup-go@v5
+        uses: actions/checkout@v6
+      
+      - name: Set up Go version
+        uses: actions/setup-go@v6
        with:
-          go-version-file: 'go.mod'
+          go-version: ${{ needs.get-go-version.outputs.version }}
+
      # Look for a CLI that's made for this PR
      - name: Fetch built CLI
        id: cli-cache
@@ -97,17 +105,20 @@ jobs:
    needs:
      - build
      - setup-test-matrix
+      - get-go-version
    runs-on: ubuntu-latest
    strategy:
      matrix: ${{fromJson(needs.setup-test-matrix.outputs.matrix)}}
      fail-fast: false
    steps:
      - name: Check out the code
-        uses: actions/checkout@v4
-      - name: Set up Go
-        uses: actions/setup-go@v5
+        uses: actions/checkout@v6
+
+      - name: Set up Go version
+        uses: actions/setup-go@v6
        with:
-          go-version-file: 'go.mod'
+          go-version: ${{ needs.get-go-version.outputs.version }}
+
      # Fetch the pre-built MinIO image from the build job
      - name: Fetch built MinIO Image
        uses: actions/cache@v4
@@ -174,7 +185,7 @@ jobs:
        timeout-minutes: 30
      - name: Upload debug bundle
        if: ${{ failure() }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v5
        with:
-          name: DebugBundle
+          name: DebugBundle-k8s-${{ matrix.k8s }}-job-${{ strategy.job-index }}
          path: /home/runner/work/velero/velero/test/e2e/debug-bundle*
--- a/.github/workflows/get-go-version.yaml
+++ b/.github/workflows/get-go-version.yaml
@@ -0,0 +1,33 @@
+on:
+  workflow_call:
+    inputs:
+      ref:
+        description: "The target branch's ref"
+        required: true
+        type: string
+    outputs:
+      version: 
+        description: "The expected Go version"
+        value: ${{ jobs.extract.outputs.version }}
+
+jobs:
+  extract:
+      runs-on: ubuntu-latest
+      outputs:
+        version: ${{ steps.pick-version.outputs.version }}
+      steps:
+        - name: Check out the code
+          uses: actions/checkout@v6
+
+        - id: pick-version  # publishes the selected Go version as the step output "version"
+          run: |
+            if [ "${{ inputs.ref }}" == "main" ]; then
+              version=$(grep '^go ' go.mod | awk '{print $2}' | cut -d. -f1-2)  # main: keep only major.minor of the go directive
+            else
+              goDirectiveVersion=$(grep '^go ' go.mod | awk '{print $2}')
+              toolChainVersion=$(grep '^toolchain ' go.mod | awk '{print $2}' | sed 's/^go//')  # "go1.2.3" -> "1.2.3" so both candidates are bare versions
+              version=$(printf "%s\n%s\n" "$goDirectiveVersion" "$toolChainVersion" | sort -V | tail -n1)  # higher of the two; an absent toolchain line sorts first
+            fi
+
+            echo "version=$version"
+            echo "version=$version" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/nightly-trivy-scan.yml
+++ b/.github/workflows/nightly-trivy-scan.yml
@@ -13,13 +13,13 @@ jobs:
        # maintain the versions of Velero those need security scan
        versions: [main]
        # list of images that need scan
-        images: [velero, velero-restore-helper]
+        images: [velero, velero-plugin-for-aws, velero-plugin-for-gcp, velero-plugin-for-microsoft-azure]
    permissions:
      security-events: write  # for github/codeql-action/upload-sarif to upload SARIF results

    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v6

      - name: Run Trivy vulnerability scanner
        uses: aquasecurity/trivy-action@master
--- a/.github/workflows/pr-changelog-check.yml
+++ b/.github/workflows/pr-changelog-check.yml
@@ -12,7 +12,7 @@ jobs:
    steps:

    - name: Check out the code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6

    - name: Changelog check
      if: ${{ !(contains(github.event.pull_request.labels.*.name, 'kind/changelog-not-required') || contains(github.event.pull_request.labels.*.name, 'Design') || contains(github.event.pull_request.labels.*.name, 'Website') || contains(github.event.pull_request.labels.*.name, 'Documentation'))}}
--- a/.github/workflows/pr-ci-check.yml
+++ b/.github/workflows/pr-ci-check.yml
@@ -1,18 +1,26 @@
 name: Pull Request CI Check
 on: [pull_request]
 jobs:
+  get-go-version:
+    uses: ./.github/workflows/get-go-version.yaml
+    with:
+      ref: ${{ github.event.pull_request.base.ref }}
+
  build:
    name: Run CI
+    needs: get-go-version
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
    steps:
      - name: Check out the code
-        uses: actions/checkout@v4
-      - name: Set up Go
-        uses: actions/setup-go@v5
+        uses: actions/checkout@v6
+
+      - name: Set up Go version
+        uses: actions/setup-go@v6
        with:
-          go-version-file: 'go.mod'
+          go-version: ${{ needs.get-go-version.outputs.version }}      
+
      - name: Make ci
        run: make ci
      - name: Upload test coverage
--- a/.github/workflows/pr-codespell.yml
+++ b/.github/workflows/pr-codespell.yml
@@ -8,14 +8,14 @@ jobs:
    steps:

    - name: Check out the code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6

    - name: Codespell
      uses: codespell-project/actions-codespell@master
      with:
        # ignore the config/.../crd.go file as it's generated binary data that is edited elsewhere.
        skip: .git,*.png,*.jpg,*.woff,*.ttf,*.gif,*.ico,./config/crd/v1beta1/crds/crds.go,./config/crd/v1/crds/crds.go,./config/crd/v2alpha1/crds/crds.go,./go.sum,./LICENSE
-        ignore_words_list: iam,aks,ist,bridget,ue,shouldnot,atleast,notin,sme,optin
+        ignore_words_list: iam,aks,ist,bridget,ue,shouldnot,atleast,notin,sme,optin,sie
        check_filenames: true
        check_hidden: true

--- a/.github/workflows/pr-containers.yml
+++ b/.github/workflows/pr-containers.yml
@@ -13,7 +13,7 @@ jobs:
    name: Build
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
      name: Checkout

    - name: Set up QEMU
--- a/.github/workflows/pr-goreleaser.yml
+++ b/.github/workflows/pr-goreleaser.yml
@@ -14,7 +14,7 @@ jobs:
    name: Build
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
      name: Checkout

    - name: Verify .goreleaser.yml and try a dryrun release.
--- a/.github/workflows/pr-linter-check.yml
+++ b/.github/workflows/pr-linter-check.yml
@@ -7,18 +7,26 @@ on:
      - "design/**"
      - "**/*.md"
 jobs:
+  get-go-version:
+    uses: ./.github/workflows/get-go-version.yaml
+    with:
+      ref: ${{ github.event.pull_request.base.ref }}
+
  build:
    name: Run Linter Check
    runs-on: ubuntu-latest
+    needs: get-go-version
    steps:
      - name: Check out the code
-        uses: actions/checkout@v4
-      - name: Set up Go
-        uses: actions/setup-go@v5
+        uses: actions/checkout@v6
+
+      - name: Set up Go version
+        uses: actions/setup-go@v6
        with:
-          go-version-file: 'go.mod'
+          go-version: ${{ needs.get-go-version.outputs.version }}
+
      - name: Linter check
-        uses: golangci/golangci-lint-action@v6
+        uses: golangci/golangci-lint-action@v9
        with:
-          version: v1.64.5
+          version: v2.5.0
          args: --verbose
--- a/.github/workflows/push-builder.yml
+++ b/.github/workflows/push-builder.yml
@@ -12,7 +12,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:

-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v6
      with:
        # The default value is "1" which fetches only a single commit. If we merge PR without squash or rebase,
        # there are at least two commits: the first one is the merge commit and the second one is the real commit
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -9,17 +9,24 @@ on:
      - '*'

 jobs:
+  get-go-version:
+    uses: ./.github/workflows/get-go-version.yaml
+    with:
+      ref: ${{ github.ref_name }}

  build:
    name: Build
    runs-on: ubuntu-latest
+    needs: get-go-version
    steps:
      - name: Check out the code
-        uses: actions/checkout@v4
-      - name: Set up Go
-        uses: actions/setup-go@v5
+        uses: actions/checkout@v6
+
+      - name: Set up Go version
+        uses: actions/setup-go@v6
        with:
-          go-version-file: 'go.mod'
+          go-version: ${{ needs.get-go-version.outputs.version }}
+
      - name: Set up QEMU
        id: qemu
        uses: docker/setup-qemu-action@v3
--- a/.github/workflows/rebase.yml
+++ b/.github/workflows/rebase.yml
@@ -9,7 +9,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - name: Checkout the latest code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
      with:
        fetch-depth: 0
    - name: Automatic Rebase
--- a/.github/workflows/stale-issues.yml
+++ b/.github/workflows/stale-issues.yml
@@ -7,7 +7,7 @@ jobs:
  stale:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@v9.1.0
+      - uses: actions/stale@v10.1.1
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}
          stale-issue-message: "This issue is stale because it has been open 60 days with no activity. Remove stale label or comment or this will be closed in 14 days. If a Velero team member has requested log or more information, please provide the output of the shared commands."
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,8 @@ debug.test*

 # make lint cache
 .cache/
+
+# Go telemetry directory created when container sets HOME to working directory
+# This happens because Makefile uses 'docker run -w /github.com/vmware-tanzu/velero'
+# and Go's os.UserConfigDir() falls back to $HOME/.config when XDG_CONFIG_HOME is unset
+.config/
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -6,7 +6,7 @@ run:
  # default concurrency is a available CPU number
  concurrency: 4

-  # timeout for analysis, e.g. 30s, 5m, default is 1m
+  # timeout for analysis, e.g. 30s, 5m, default is 0
  timeout: 20m

  # exit code when at least one issue was found, default is 1
@@ -29,293 +29,281 @@ run:

 # output configuration options
 output:
-  # colored-line-number|line-number|json|tab|checkstyle|code-climate, default is "colored-line-number"
  formats:
-    - format: colored-line-number
+    text:
      path: stdout

-  # print lines of code with issue, default is true
-  print-issued-lines: true
+      # print lines of code with issue, default is true
+      print-issued-lines: true

-  # print linter name in the end of issue text, default is true
-  print-linter-name: true
+      # print linter name in the end of issue text, default is true
+      print-linter-name: true

-# all available settings of specific linters
-linters-settings:
-
-  depguard:
-    rules:
-      main:
-        deny:
-          # specify an error message to output when a denylisted package is used
-          - pkg: github.com/sirupsen/logrus
-            desc: "logging is allowed only by logutils.Log"
-
-  dogsled:
-    # checks assignments with too many blank identifiers; default is 2
-    max-blank-identifiers: 2
-
-  dupl:
-    # tokens count to trigger issue, 150 by default
-    threshold: 100
-
-  errcheck:
-    # report about not checking of errors in type assertions: `a := b.(MyStruct)`;
-    # default is false: such cases aren't reported by default.
-    check-type-assertions: false
-
-    # report about assignment of errors to blank identifier: `num, _ := strconv.Atoi(numStr)`;
-    # default is false: such cases aren't reported by default.
-    check-blank: false
-
-    # [deprecated] comma-separated list of pairs of the form pkg:regex
-    # the regex is used to ignore names within pkg. (default "fmt:.*").
-    # see https://github.com/kisielk/errcheck#the-deprecated-method for details
-    # ignore: fmt:.*,io/ioutil:^Read.*
-
-    # path to a file containing a list of functions to exclude from checking
-    # see https://github.com/kisielk/errcheck#excluding-functions for details
-    # exclude: /path/to/file.txt
-
-  exhaustive:
-    # indicates that switch statements are to be considered exhaustive if a
-    # 'default' case is present, even if all enum members aren't listed in the
-    # switch
-    default-signifies-exhaustive: false
-
-  funlen:
-    lines: 60
-    statements: 40
-
-  gocognit:
-    # minimal code complexity to report, 30 by default (but we recommend 10-20)
-    min-complexity: 10
-
-  nestif:
-    # minimal complexity of if statements to report, 5 by default
-    min-complexity: 4
-
-  goconst:
-    # minimal length of string constant, 3 by default
-    min-len: 3
-    # minimal occurrences count to trigger, 3 by default
-    min-occurrences: 5
-
-  gocritic:
-    # Which checks should be enabled; can't be combined with 'disabled-checks';
-    # See https://go-critic.github.io/overview#checks-overview
-    # To check which checks are enabled run `GL_DEBUG=gocritic golangci-lint run`
-    # By default list of stable checks is used.
-    # enabled-checks:
-    #  - rangeValCopy
-
-    # Which checks should be disabled; can't be combined with 'enabled-checks'; default is empty
-    # disabled-checks:
-    #  - regexpMust
-
-    # Enable multiple checks by tags, run `GL_DEBUG=gocritic golangci-lint run` to see all tags and checks.
-    # Empty list by default. See https://github.com/go-critic/go-critic#usage -> section "Tags".
-    # enabled-tags:
-    #  - performance
-    # disabled-tags:
-    #  - experimental
-
-    settings: # settings passed to gocritic
-      captLocal: # must be valid enabled check name
-        paramsOnly: true
-    #  rangeValCopy:
-    #    sizeThreshold: 32
-
-  gocyclo:
-    # minimal code complexity to report, 30 by default (but we recommend 10-20)
-    min-complexity: 10
-
-  godot:
-    # check all top-level comments, not only declarations
-    check-all: false
-
-  godox:
-    # report any comments starting with keywords, this is useful for TODO or FIXME comments that
-    # might be left in the code accidentally and should be resolved before merging
-    keywords: # default keywords are TODO, BUG, and FIXME, these can be overwritten by this setting
-      - NOTE
-      - OPTIMIZE # marks code that should be optimized before merging
-      - HACK # marks hack-arounds that should be removed before merging
-
-  gofmt:
-    # simplify code: gofmt with `-s` option, true by default
-    simplify: true
-
-  goimports:
-    # put imports beginning with prefix after 3rd-party packages;
-    # it's a comma-separated list of prefixes
-    local-prefixes: github.com/org/project
-
-  gosec:
-    excludes:
-      - G115
-
-  govet:
-    # report about shadowed variables
-    # check-shadowing: true
-
-    # settings per analyzer
-    settings:
-      printf: # analyzer name, run `go tool vet help` to see all analyzers
-        funcs: # run `go tool vet help printf` to see available settings for `printf` analyzer
-          - (github.com/golangci/golangci-lint/pkg/logutils.Log).Infof
-          - (github.com/golangci/golangci-lint/pkg/logutils.Log).Warnf
-          - (github.com/golangci/golangci-lint/pkg/logutils.Log).Errorf
-          - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf
-
-    # enable or disable analyzers by name
-    enable:
-      - atomicalign
-    enable-all: false
-    disable:
-      - shadow
-    disable-all: false
-
-  lll:
-    # max line length, lines longer will be reported. Default is 120.
-    # '\t' is counted as 1 character by default, and can be changed with the tab-width option
-    line-length: 120
-    # tab width in spaces. Default to 1.
-    tab-width: 1
-
-  misspell:
-    # Correct spellings using locale preferences for US or UK.
-    # Default is to use a neutral variety of English.
-    # Setting locale to US will correct the British spelling of 'colour' to 'color'.
-    locale: US
-    ignore-words:
-      - someword
-
-  nakedret:
-    # make an issue if func has more lines of code than this setting and it has naked returns; default is 30
-    max-func-lines: 30
-
-  prealloc:
-    # XXX: we don't recommend using this linter before doing performance profiling.
-    # For most programs usage of prealloc will be a premature optimization.
-
-    # Report preallocation suggestions only on simple loops that have no returns/breaks/continues/gotos in them.
-    # True by default.
-    simple: true
-    range-loops: true # Report preallocation suggestions on range loops, true by default
-    for-loops: false # Report preallocation suggestions on for loops, false by default
-
-  nolintlint:
-    # Enable to ensure that nolint directives are all used. Default is true.
-    allow-unused: false
-    # Exclude following linters from requiring an explanation.  Default is [].
-    allow-no-explanation: []
-    # Enable to require an explanation of nonzero length after each nolint directive. Default is false.
-    require-explanation: true
-    # Enable to require nolint directives to mention the specific linter being suppressed. Default is false.
-    require-specific: true
-
-  perfsprint:
-    strconcat: false
-    sprintf1: false
-    errorf: false
-    int-conversion: true
-
-  revive:
-    rules:
-      - name: blank-imports
-        disabled: true
-      - name: context-as-argument
-        disabled: true
-      - name: context-keys-type
-      - name: dot-imports
-        disabled: true
-      - name: early-return
-        disabled: true
-        arguments:
-          - "preserveScope"
-      - name: empty-block
-        disabled: true
-      - name: error-naming
-        disabled: true
-      - name: error-return
-        disabled: true
-      - name: error-strings
-        disabled: true
-      - name: errorf
-        disabled: true
-      - name: increment-decrement
-      - name: indent-error-flow
-        disabled: true
-      - name: range
-      - name: receiver-naming
-        disabled: true
-      - name: redefines-builtin-id
-        disabled: true
-      - name: superfluous-else
-        disabled: true
-        arguments:
-          - "preserveScope"
-      - name: time-naming
-      - name: unexported-return
-        disabled: true
-      - name: unnecessary-stmt
-      - name: unreachable-code
-      - name: unused-parameter
-        disabled: true
-      - name: use-any
-      - name: var-declaration
-      - name: var-naming
-        disabled: true
-
-  rowserrcheck:
-    packages:
-      - github.com/jmoiron/sqlx
-
-  testifylint:
-      # TODO: enable them all
-      disable:
-        - go-require
-        - float-compare
-        - require-error
-      enable-all: true
-
-  testpackage:
-    # regexp pattern to skip files
-    skip-regexp: (export|internal)_test\.go
-  unparam:
-    # Inspect exported functions, default is false. Set to true if no external program/library imports your code.
-    # XXX: if you enable this setting, unparam will report a lot of false-positives in text editors:
-    # if it's called for subdir of a project it can't find external interfaces. All text editor integrations
-    # with golangci-lint call it on a directory with the changed file.
-    check-exported: false
-
-  whitespace:
-    multi-if: false   # Enforces newlines (or comments) after every multi-line if statement
-    multi-func: false # Enforces newlines (or comments) after every multi-line function signature
-
-  wsl:
-    # If true append is only allowed to be cuddled if appending value is
-    # matching variables, fields or types on line above. Default is true.
-    strict-append: true
-    # Allow calls and assignments to be cuddled as long as the lines have any
-    # matching variables, fields or types. Default is true.
-    allow-assign-and-call: true
-    # Allow multiline assignments to be cuddled. Default is true.
-    allow-multiline-assign: true
-    # Allow declarations (var) to be cuddled.
-    allow-cuddle-declarations: false
-    # Allow trailing comments in ending of blocks
-    allow-trailing-comment: false
-    # Force newlines in end of case at this limit (0 = never).
-    force-case-trailing-whitespace: 0
-    # Force cuddling of err checks with err var assignment
-    force-err-cuddling: false
-    # Allow leading comments to be separated with empty lines
-    allow-separated-leading-comment: false
+  # Show statistics per linter.
+  show-stats: false

 linters:
-  disable-all: true
+  # all available settings of specific linters
+  settings:
+    depguard:
+      rules:
+        main:
+          deny:
+            # specify an error message to output when a denylisted package is used
+            - pkg: github.com/sirupsen/logrus
+              desc: "logging is allowed only by logutils.Log"
+
+    dogsled:
+      # checks assignments with too many blank identifiers; default is 2
+      max-blank-identifiers: 2
+
+    dupl:
+      # tokens count to trigger issue, 150 by default
+      threshold: 100
+
+    errcheck:
+      # report about not checking of errors in type assertions: `a := b.(MyStruct)`;
+      # default is false: such cases aren't reported by default.
+      check-type-assertions: false
+
+      # report about assignment of errors to blank identifier: `num, _ := strconv.Atoi(numStr)`;
+      # default is false: such cases aren't reported by default.
+      check-blank: false
+
+
+    exhaustive:
+      # indicates that switch statements are to be considered exhaustive if a
+      # 'default' case is present, even if all enum members aren't listed in the
+      # switch
+      default-signifies-exhaustive: false
+
+    funlen:
+      lines: 60
+      statements: 40
+
+    gocognit:
+      # minimal code complexity to report, 30 by default (but we recommend 10-20)
+      min-complexity: 10
+
+    nestif:
+      # minimal complexity of if statements to report, 5 by default
+      min-complexity: 4
+
+    goconst:
+      # minimal length of string constant, 3 by default
+      min-len: 3
+      # minimal occurrences count to trigger, 3 by default
+      min-occurrences: 5
+
+    gocritic:
+      # Which checks should be enabled; can't be combined with 'disabled-checks';
+      # See https://go-critic.github.io/overview#checks-overview
+      # To check which checks are enabled run `GL_DEBUG=gocritic golangci-lint run`
+      # By default list of stable checks is used.
+      settings: # settings passed to gocritic
+        captLocal: # must be valid enabled check name
+          paramsOnly: true
+
+    gocyclo:
+      # minimal code complexity to report, 30 by default (but we recommend 10-20)
+      min-complexity: 10
+
+    godot:
+      # check all top-level comments, not only declarations
+      check-all: false
+
+    godox:
+      # report any comments starting with keywords, this is useful for TODO or FIXME comments that
+      # might be left in the code accidentally and should be resolved before merging
+      keywords: # default keywords are TODO, BUG, and FIXME, these can be overwritten by this setting
+        - NOTE
+        - OPTIMIZE # marks code that should be optimized before merging
+        - HACK # marks hack-arounds that should be removed before merging
+
+    gosec:
+      excludes:
+        - G115
+
+    govet:
+      # enable or disable analyzers by name
+      enable:
+        - atomicalign
+      enable-all: false
+      disable:
+        - shadow
+      disable-all: false
+  
+    importas:
+       alias:
+        - alias: appsv1api
+          pkg: k8s.io/api/apps/v1
+        - alias: corev1api
+          pkg: k8s.io/api/core/v1
+        - alias: rbacv1
+          pkg: k8s.io/api/rbac/v1
+        - alias: apierrors
+          pkg: k8s.io/apimachinery/pkg/api/errors
+        - alias: apiextv1
+          pkg: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1
+        - alias: metav1
+          pkg: k8s.io/apimachinery/pkg/apis/meta/v1
+        - alias: storagev1api
+          pkg: k8s.io/api/storage/v1
+        - alias: batchv1api
+          pkg: k8s.io/api/batch/v1
+
+    lll:
+      # max line length, lines longer will be reported. Default is 120.
+      # '\t' is counted as 1 character by default, and can be changed with the tab-width option
+      line-length: 120
+      # tab width in spaces. Default to 1.
+      tab-width: 1
+
+    misspell:
+      # Correct spellings using locale preferences for US or UK.
+      # Default is to use a neutral variety of English.
+      # Setting locale to US will correct the British spelling of 'colour' to 'color'.
+      locale: US
+      ignore-rules:
+        - someword
+
+    nakedret:
+      # make an issue if func has more lines of code than this setting and it has naked returns; default is 30
+      max-func-lines: 30
+
+    prealloc:
+      # XXX: we don't recommend using this linter before doing performance profiling.
+      # For most programs usage of prealloc will be a premature optimization.
+
+      # Report preallocation suggestions only on simple loops that have no returns/breaks/continues/gotos in them.
+      # True by default.
+      simple: true
+      range-loops: true # Report preallocation suggestions on range loops, true by default
+      for-loops: false # Report preallocation suggestions on for loops, false by default
+
+    nolintlint:
+      # Enable to ensure that nolint directives are all used. Default is true.
+      allow-unused: false
+      # Exclude following linters from requiring an explanation.  Default is [].
+      allow-no-explanation: []
+      # Enable to require an explanation of nonzero length after each nolint directive. Default is false.
+      require-explanation: true
+      # Enable to require nolint directives to mention the specific linter being suppressed. Default is false.
+      require-specific: true
+
+    perfsprint:
+      strconcat: false
+      sprintf1: false
+      errorf: false
+      int-conversion: true
+
+    revive:
+      rules:
+        - name: blank-imports
+          disabled: true
+        - name: context-as-argument
+          disabled: true
+        - name: context-keys-type
+        - name: dot-imports
+          disabled: true
+        - name: early-return
+          disabled: true
+          arguments:
+            - "preserveScope"
+        - name: empty-block
+          disabled: true
+        - name: error-naming
+          disabled: true
+        - name: error-return
+          disabled: true
+        - name: error-strings
+          disabled: true
+        - name: errorf
+          disabled: true
+        - name: increment-decrement
+        - name: indent-error-flow
+          disabled: true
+        - name: range
+        - name: receiver-naming
+          disabled: true
+        - name: redefines-builtin-id
+          disabled: true
+        - name: superfluous-else
+          disabled: true
+          arguments:
+            - "preserveScope"
+        - name: time-naming
+        - name: unexported-return
+          disabled: true
+        - name: unnecessary-stmt
+        - name: unreachable-code
+        - name: unused-parameter
+          disabled: true
+        - name: use-any
+        - name: var-declaration
+        - name: var-naming
+          disabled: true
+
+    rowserrcheck:
+      packages:
+        - github.com/jmoiron/sqlx
+
+    staticcheck:
+      checks:
+        - all
+        - -QF1001 # FIXME
+        - -QF1003 # FIXME
+        - -QF1004 # FIXME
+        - -QF1007 # FIXME
+        - -QF1008 # FIXME
+        - -QF1009 # FIXME
+        - -QF1012 # FIXME
+
+    testifylint:
+      # TODO: enable them all
+      disable:
+        - float-compare
+        - go-require
+      enable-all: true
+
+    testpackage:
+      # regexp pattern to skip files
+      skip-regexp: (export|internal)_test\.go
+    unparam:
+      # Inspect exported functions, default is false. Set to true if no external program/library imports your code.
+      # XXX: if you enable this setting, unparam will report a lot of false-positives in text editors:
+      # if it's called for subdir of a project it can't find external interfaces. All text editor integrations
+      # with golangci-lint call it on a directory with the changed file.
+      check-exported: false
+
+    usetesting:
+      os-setenv: false
+
+    whitespace:
+      multi-if: false # Enforces newlines (or comments) after every multi-line if statement
+      multi-func: false # Enforces newlines (or comments) after every multi-line function signature
+
+    wsl:
+      # If true append is only allowed to be cuddled if appending value is
+      # matching variables, fields or types on line above. Default is true.
+      strict-append: true
+      # Allow calls and assignments to be cuddled as long as the lines have any
+      # matching variables, fields or types. Default is true.
+      allow-assign-and-call: true
+      # Allow multiline assignments to be cuddled. Default is true.
+      allow-multiline-assign: true
+      # Allow declarations (var) to be cuddled.
+      allow-cuddle-declarations: false
+      # Allow trailing comments in ending of blocks
+      allow-trailing-comment: false
+      # Force newlines in end of case at this limit (0 = never).
+      force-case-trailing-whitespace: 0
+      # Force cuddling of err checks with err var assignment
+      force-err-cuddling: false
+      # Allow leading comments to be separated with empty lines
+      allow-separated-leading-comment: false
+
+  default: none
  enable:
    - asasalint
    - asciicheck
@@ -323,88 +311,87 @@ linters:
    - bodyclose
    - copyloopvar
    - dogsled
-    - durationcheck
    - dupword
+    - durationcheck
    - errcheck
    - errchkjson
+    - exptostd
+    - ginkgolinter
    - goconst
-    - gofmt
    - goheader
-    - goimports
    - goprintffuncname
    - gosec
-    - gosimple
    - govet
-    - ginkgolinter
    - importas
    - ineffassign
    - misspell
    - nakedret
-    - nosprintfhostport
    - nilerr
    - noctx
    - nolintlint
+    - nosprintfhostport
    - perfsprint
    - revive
    - staticcheck
-    - stylecheck
    - testifylint
    - thelper
-    - typecheck
    - unconvert
    - unparam
    - unused
    - usestdlibvars
+    - usetesting
    - whitespace
-  fast: false
+
+  exclusions:
+    # which file paths to exclude: they are still analyzed, but issues from
+    # them won't be reported; each entry is a regular expression matched
+    # against the file path. golangci-lint v2 no longer skips any directory
+    # by default, so every directory to skip has to be listed here.
+    # "/" will be replaced by current OS file path separator to properly work
+    # on Windows.
+    paths:
+      - pkg/plugin/generated/*
+      - third_party
+
+    rules:
+      - linters:
+          - staticcheck
+        text: "DefaultVolumesToRestic" # No need to report the deprecation of DefaultVolumesToRestic.
+      - path: ".*_test.go$"
+        linters:
+          - errcheck
+          - goconst
+          - gosec
+          - govet
+          - staticcheck
+          - unparam
+          - unused
+      - path: test/
+        linters:
+          - errcheck
+          - goconst
+          - gosec
+          - nilerr
+          - staticcheck
+          - unparam
+          - unused
+      - path: ".*data_upload_controller_test.go$"
+        linters:
+          - dupword
+        text: "type"
+      - path: ".*config_test.go$"
+        linters:
+          - dupword
+        text: "bucket"
+
+    generated: lax
+    presets:
+      - comments
+      - common-false-positives
+      - legacy
+      - std-error-handling

 issues:
-  # which dirs to skip: issues from them won't be reported;
-  # can use regexp here: generated.*, regexp is applied on full path;
-  # default value is empty list, but default dirs are skipped independently
-  # from this option's value (see skip-dirs-use-default).
-  # "/" will be replaced by current OS file path separator to properly work
-  # on Windows.
-  exclude-dirs:
-    - pkg/plugin/generated/*
-
-  exclude-rules:
-    - linters:
-        - staticcheck
-      text: "DefaultVolumesToRestic" # No need to report deprecate for DefaultVolumesToRestic.
-    - path: ".*_test.go$"
-      linters:
-        - errcheck
-        - goconst
-        - gosec
-        - govet
-        - staticcheck
-        - stylecheck
-        - unparam
-        - unused
-    - path: test/
-      linters:
-        - errcheck
-        - goconst
-        - gosec
-        - nilerr
-        - staticcheck
-        - stylecheck
-        - unparam
-        - unused
-    - path: ".*data_upload_controller_test.go$"
-      linters:
-        - dupword
-      text: "type"
-    - path: ".*config_test.go$"
-      linters:
-        - dupword
-      text: "bucket"
-
-  # The list of ids of default excludes to include or disable. By default it's empty.
-  include:
-    - EXC0002 # disable excluding of issues about comments from golint
-
  # Maximum issues count per one linter. Set to 0 to disable. Default is 50.
  max-issues-per-linter: 0

@@ -414,20 +401,29 @@ issues:
  # make issues output unique by line, default is true
  uniq-by-line: true

-severity:
-  # Default value is empty string.
-  # Set the default severity for issues. If severity rules are defined and the issues 
-  # do not match or no severity is provided to the rule this will be the default 
-  # severity applied. Severities should match the supported severity names of the 
-  # selected out format.
-  # - Code climate: https://docs.codeclimate.com/docs/issues#issue-severity
-  # -   Checkstyle: https://checkstyle.sourceforge.io/property_types.html#severity
-  # -       Github: https://help.github.com/en/actions/reference/workflow-commands-for-github-actions#setting-an-error-message
-  default-severity: error
+# This file contains all available configuration options
+# with their default values.
+formatters:
+  enable:
+    - gofmt
+    - goimports

-  # The default value is false. 
-  # If set to true severity-rules regular expressions become case sensitive.
-  case-sensitive: false
+  exclusions:
+    generated: lax
+    paths:
+      - pkg/plugin/generated/*
+      - third_party
+
+  settings:
+    gofmt:
+      # simplify code: gofmt with `-s` option, true by default
+      simplify: true
+    goimports:
+      local-prefixes:
+        - github.com/vmware-tanzu/velero
+
+severity:
+  default: error

  # Default value is empty list.
  # When a list of severity rules are provided, severity information will be added to lint
@@ -436,5 +432,7 @@ severity:
  # Only affects out formats that support setting severity information.
  rules:
    - linters:
-      - dupl
+        - dupl
      severity: info
+
+version: "2"
--- a/.goreleaser.yml
+++ b/.goreleaser.yml
@@ -26,18 +26,23 @@ builds:
      - arm
      - arm64
      - ppc64le
+      - s390x
    ignore:
      # don't build arm for darwin and arm/arm64 for windows
      - goos: darwin
        goarch: arm
      - goos: darwin
        goarch: ppc64le
+      - goos: darwin
+        goarch: s390x
      - goos: windows
        goarch: arm
      - goos: windows
        goarch: arm64
      - goos: windows
        goarch: ppc64le
+      - goos: windows
+        goarch: s390x
    ldflags:
      - -X "github.com/vmware-tanzu/velero/pkg/buildinfo.Version={{ .Tag }}" -X "github.com/vmware-tanzu/velero/pkg/buildinfo.GitSHA={{ .FullCommit }}" -X "github.com/vmware-tanzu/velero/pkg/buildinfo.GitTreeState={{ .Env.GIT_TREE_STATE }}" -X "github.com/vmware-tanzu/velero/pkg/buildinfo.ImageRegistry={{ .Env.REGISTRY }}"
 archives:
@@ -60,4 +65,4 @@ git:
  # tags if there are more than one tag in the same commit.
  #
  # Default: `-version:refname`
-  tag_sort: -version:creatordate
+  tag_sort: -version:creatordate
--- a/7
+++ b/7
@@ -13,7 +13,7 @@
 # limitations under the License.

 # Velero binary build section
-FROM --platform=$BUILDPLATFORM golang:1.23.11-bookworm AS velero-builder
+FROM --platform=$BUILDPLATFORM golang:1.25-bookworm AS velero-builder

 ARG GOPROXY
 ARG BIN
@@ -49,7 +49,7 @@ RUN mkdir -p /output/usr/bin && \
    go clean -modcache -cache

 # Restic binary build section
-FROM --platform=$BUILDPLATFORM golang:1.23.11-bookworm AS restic-builder
+FROM --platform=$BUILDPLATFORM golang:1.25-bookworm AS restic-builder

 ARG GOPROXY
 ARG BIN
@@ -73,7 +73,7 @@ RUN mkdir -p /output/usr/bin && \
    go clean -modcache -cache

 # Velero image packing section
-FROM paketobuildpacks/run-jammy-tiny:0.2.73
+FROM paketobuildpacks/run-jammy-tiny:latest

 LABEL maintainer="Xun Jiang <jxun@vmware.com>"

@@ -82,3 +82,4 @@ COPY --from=velero-builder /output /
 COPY --from=restic-builder /output /

 USER cnb:cnb
+
--- a/4
+++ b/4
@@ -15,7 +15,7 @@
 ARG OS_VERSION=1809

 # Velero binary build section
-FROM --platform=$BUILDPLATFORM golang:1.23.10-bookworm AS velero-builder
+FROM --platform=$BUILDPLATFORM golang:1.25-bookworm AS velero-builder

 ARG GOPROXY
 ARG BIN
@@ -44,6 +44,8 @@ RUN mkdir -p /output/usr/bin && \
    export GOARM=$( echo "${GOARM}" | cut -c2-) && \
    go build -o /output/${BIN}.exe \
    -ldflags "${LDFLAGS}" ${PKG}/cmd/${BIN} && \
+    go build -o /output/velero-restore-helper.exe \
+    -ldflags "${LDFLAGS}" ${PKG}/cmd/velero-restore-helper && \    
    go build -o /output/velero-helper.exe \
    -ldflags "${LDFLAGS}" ${PKG}/cmd/velero-helper && \
    go clean -modcache -cache
--- a/8
+++ b/8
@@ -65,7 +65,7 @@ endif
 BUILDER_IMAGE := $(REGISTRY)/build-image:$(BUILDER_IMAGE_TAG)
 BUILDER_IMAGE_CACHED := $(shell docker images -q ${BUILDER_IMAGE} 2>/dev/null )

-HUGO_IMAGE := hugo-builder
+HUGO_IMAGE := ghcr.io/gohugoio/hugo

 # Which architecture to build - see $(ALL_ARCH) for options.
 # if the 'local' rule is being run, detect the ARCH from 'go env'
@@ -108,7 +108,7 @@ comma=,
 # The version of restic binary to be downloaded
 RESTIC_VERSION ?= 0.15.0

-CLI_PLATFORMS ?= linux-amd64 linux-arm linux-arm64 darwin-amd64 darwin-arm64 windows-amd64 linux-ppc64le
+CLI_PLATFORMS ?= linux-amd64 linux-arm linux-arm64 darwin-amd64 darwin-arm64 windows-amd64 linux-ppc64le linux-s390x
 BUILD_OUTPUT_TYPE ?= docker
 BUILD_OS ?= linux
 BUILD_ARCH ?= amd64
@@ -451,7 +451,7 @@ release:
 serve-docs: build-image-hugo
 	docker run \
 	--rm \
-	-v "$$(pwd)/site:/srv/hugo" \
+	-v "$$(pwd)/site:/project" \
 	-it -p 1313:1313 \
 	$(HUGO_IMAGE) \
 	server --bind=0.0.0.0 --enableGitInfo=false
@@ -487,4 +487,4 @@ new-changelog:
 	fi
 	@mkdir -p ./changelogs/unreleased/ && \
 	echo $(CHANGELOG_BODY) > ./changelogs/unreleased/$(GH_PR_NUMBER)-$(GH_LOGIN) && \
-	echo \"$(CHANGELOG_BODY)\" added to "./changelogs/unreleased/$(GH_PR_NUMBER)-$(GH_LOGIN)"
+	echo \"$(CHANGELOG_BODY)\" added to "./changelogs/unreleased/$(GH_PR_NUMBER)-$(GH_LOGIN)"
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ The following is a list of the supported Kubernetes versions for each Velero ver

 | Velero version | Expected Kubernetes version compatibility | Tested on Kubernetes version        |
 |----------------|-------------------------------------------|-------------------------------------|
+| 1.17           | 1.18-latest                               | 1.31.7, 1.32.3, 1.33.1, and 1.34.0          |
 | 1.16           | 1.18-latest                               | 1.31.4, 1.32.3, and 1.33.0          |
 | 1.15           | 1.18-latest                               | 1.28.8, 1.29.8, 1.30.4 and 1.31.1   |
 | 1.14           | 1.18-latest                               | 1.27.9, 1.28.9, and 1.29.4          |
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -12,13 +12,13 @@ The Velero project maintains the following [governance document](https://github.

 Security is of the highest importance and all security vulnerabilities or suspected security vulnerabilities should be reported to Velero privately, to minimize attacks against current users of Velero before they are fixed. Vulnerabilities will be investigated and patched on the next patch (or minor) release as soon as possible. This information could be kept entirely internal to the project.  

-If you know of a publicly disclosed security vulnerability for Velero, please **IMMEDIATELY** contact the VMware Security Team (security@vmware.com).
+If you know of a publicly disclosed security vulnerability for Velero, please **IMMEDIATELY** contact the Security Team (velero-security.pdl@broadcom.com).

 

 **IMPORTANT: Do not file public issues on GitHub for security vulnerabilities**

-To report a vulnerability or a security-related issue, please contact the VMware email address with the details of the vulnerability. The email will be fielded by the VMware Security Team and then shared with the Velero maintainers who have committer and release permissions. Emails will be addressed within 3 business days, including a detailed plan to investigate the issue and any potential workarounds to perform in the meantime. Do not report non-security-impacting bugs through this channel. Use [GitHub issues](https://github.com/vmware-tanzu/velero/issues/new/choose) instead.
+To report a vulnerability or a security-related issue, please contact the Security Team email address (velero-security.pdl@broadcom.com) with the details of the vulnerability. The email will be fielded by the Security Team and then shared with the Velero maintainers who have committer and release permissions. Emails will be addressed within 3 business days, including a detailed plan to investigate the issue and any potential workarounds to perform in the meantime. Do not report non-security-impacting bugs through this channel. Use [GitHub issues](https://github.com/vmware-tanzu/velero/issues/new/choose) instead.


 ## Proposed Email Content
@@ -29,7 +29,7 @@ Provide a descriptive subject line and in the body of the email include the foll

 *   Basic identity information, such as your name and your affiliation or company.
 *   Detailed steps to reproduce the vulnerability  (POC scripts, screenshots, and logs are all helpful to us).
-*   Description of the effects of the vulnerability on Velero and the related hardware and software configurations, so that the VMware Security Team can reproduce it.
+*   Description of the effects of the vulnerability on Velero and the related hardware and software configurations, so that the Security Team can reproduce it.
 *   How the vulnerability affects Velero usage and an estimation of the attack surface, if there is one.
 *   List other projects or dependencies that were used in conjunction with Velero to produce the vulnerability.

@@ -49,7 +49,7 @@ Provide a descriptive subject line and in the body of the email include the foll

 ## Patch, Release, and Disclosure

-The VMware Security Team will respond to vulnerability reports as follows:
+The Security Team will respond to vulnerability reports as follows:

 

@@ -62,7 +62,7 @@ The VMware Security Team will respond to vulnerability reports as follows:
 5. The Security Team will also create a [CVSS](https://www.first.org/cvss/specification-document) using the [CVSS Calculator](https://www.first.org/cvss/calculator/3.0). The Security Team makes the final call on the calculated CVSS; it is better to move quickly than making the CVSS perfect. Issues may also be reported to [Mitre](https://cve.mitre.org/) using this [scoring calculator](https://nvd.nist.gov/vuln-metrics/cvss/v3-calculator). The CVE will initially be set to private.
 6. The Security Team will work on fixing the vulnerability and perform internal testing before preparing to roll out the fix.
 7. The Security Team will provide early disclosure of the vulnerability by emailing the [Velero Distributors](https://groups.google.com/u/1/g/projectvelero-distributors) mailing list. Distributors can initially plan for the vulnerability patch ahead of the fix, and later can test the fix and provide feedback to the Velero team. See the section **Early Disclosure to Velero Distributors List** for details about how to join this mailing list. 
-8. A public disclosure date is negotiated by the VMware SecurityTeam, the bug submitter, and the distributors list. We prefer to fully disclose the bug as soon as possible once a user mitigation or patch is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for distributor coordination. The timeframe for disclosure is from immediate (especially if it’s already publicly known) to a few weeks. For a critical vulnerability with a straightforward mitigation, we expect the report date for the public disclosure date to be on the order of 14 business days. The VMware Security Team holds the final say when setting a public disclosure date.
+8. A public disclosure date is negotiated by the Security Team, the bug submitter, and the distributors list. We prefer to fully disclose the bug as soon as possible once a user mitigation or patch is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for distributor coordination. The timeframe for disclosure is from immediate (especially if it’s already publicly known) to a few weeks. For a critical vulnerability with a straightforward mitigation, we expect the time from the report date to the public disclosure date to be on the order of 14 business days. The Security Team holds the final say when setting a public disclosure date.
 9. Once the fix is confirmed, the Security Team will patch the vulnerability in the next patch or minor release, and backport a patch release into all earlier supported releases. Upon release of the patched version of Velero, we will follow the **Public Disclosure Process**.


@@ -79,7 +79,7 @@ The Security Team will also publish any mitigating steps users can take until th



-*   Use security@vmware.com to report security concerns to the VMware Security Team, who uses the list to privately discuss security issues and fixes prior to disclosure.
+*   Use velero-security.pdl@broadcom.com to report security concerns to the Security Team, who uses the list to privately discuss security issues and fixes prior to disclosure.
 *   Join the [Velero Distributors](https://groups.google.com/u/1/g/projectvelero-distributors) mailing list for early private information and vulnerability disclosure. Early disclosure may include mitigating steps and additional information on security patch releases. See below for information on how Velero distributors or vendors can apply to join this list.


@@ -107,11 +107,11 @@ To be eligible to join the [Velero Distributors](https://groups.google.com/u/1/g

 ## Embargo Policy

-The information that members receive on the Velero Distributors mailing list must not be made public, shared, or even hinted at anywhere beyond those who need to know within your specific team, unless you receive explicit approval to do so from the VMware Security Team. This remains true until the public disclosure date/time agreed upon by the list. Members of the list and others cannot use the information for any reason other than to get the issue fixed for your respective distribution's users.
+The information that members receive on the Velero Distributors mailing list must not be made public, shared, or even hinted at anywhere beyond those who need to know within your specific team, unless you receive explicit approval to do so from the Security Team. This remains true until the public disclosure date/time agreed upon by the list. Members of the list and others cannot use the information for any reason other than to get the issue fixed for your respective distribution's users.

 Before you share any information from the list with members of your team who are required to fix the issue, these team members must agree to the same terms, and only be provided with information on a need-to-know basis.

-In the unfortunate event that you share information beyond what is permitted by this policy, you must urgently inform the VMware Security Team (security@vmware.com) of exactly what information was leaked and to whom. If you continue to leak information and break the policy outlined here, you will be permanently removed from the list.
+In the unfortunate event that you share information beyond what is permitted by this policy, you must urgently inform the Security Team (velero-security.pdl@broadcom.com) of exactly what information was leaked and to whom. If you continue to leak information and break the policy outlined here, you will be permanently removed from the list.

 

@@ -123,6 +123,6 @@ Send new membership requests to projectvelero-distributors@googlegroups.com. In

 ## Confidentiality, integrity and availability

-We consider vulnerabilities leading to the compromise of data confidentiality, elevation of privilege, or integrity to be our highest priority concerns. Availability, in particular in areas relating to DoS and resource exhaustion, is also a serious security concern. The VMware Security Team takes all vulnerabilities, potential vulnerabilities, and suspected vulnerabilities seriously and will investigate them in an urgent and expeditious manner.
+We consider vulnerabilities leading to the compromise of data confidentiality, elevation of privilege, or integrity to be our highest priority concerns. Availability, in particular in areas relating to DoS and resource exhaustion, is also a serious security concern. The Security Team takes all vulnerabilities, potential vulnerabilities, and suspected vulnerabilities seriously and will investigate them in an urgent and expeditious manner.

 Note that we do not currently consider the default settings for Velero to be secure-by-default. It is necessary for operators to explicitly configure settings, role based access control, and other resource related features in Velero to provide a hardened Velero environment. We will not act on any security disclosure that relates to a lack of safe defaults. Over time, we will work towards improved safe-by-default configuration, taking into account backwards compatibility.
--- a/2
+++ b/2
@@ -52,7 +52,7 @@ git_sha = str(local("git rev-parse HEAD", quiet = True, echo_off = True)).strip(

 tilt_helper_dockerfile_header = """
 # Tilt image
-FROM golang:1.23.11 as tilt-helper
+FROM golang:1.25 as tilt-helper

 # Support live reloading with Tilt
 RUN wget --output-document /restart.sh --quiet https://raw.githubusercontent.com/windmilleng/rerun-process-wrapper/master/restart.sh  && \
--- a/changelogs/CHANGELOG-1.16.md
+++ b/changelogs/CHANGELOG-1.16.md
@@ -1,48 +1,3 @@
-## v1.16.2
-
-### Download
-https://github.com/vmware-tanzu/velero/releases/tag/v1.16.2
-
-### Container Image
-`velero/velero:v1.16.2`
-
-### Documentation
-https://velero.io/docs/v1.16/
-
-### Upgrading
-https://velero.io/docs/v1.16/upgrade-to-1.16/
-
-### All Changes
-  * Update "Default Volumes to Fs Backup" to "File System Backup (Default)" (#9105, @shubham-pampattiwar)
-  * Fix missing defaultVolumesToFsBackup flag output in Velero describe backup cmd (#9103, @shubham-pampattiwar)
-  * Add imagePullSecrets inheritance for VGDP pod and maintenance job. (#9102, @blackpiglet)
-  * Fix issue #9077, don't block backup deletion on list VS error (#9101, @Lyndon-Li)
-  * Mounted cloud credentials should not be world-readable (#9094, @sseago)
-  * Allow for proper tracking of multiple hooks per container (#9060, @sseago)
-  * Add BSL status check for backup/restore operations. (#9010, @blackpiglet)
-
-
-## v1.16.1
-
-### Download
-https://github.com/vmware-tanzu/velero/releases/tag/v1.16.1
-
-### Container Image
-`velero/velero:v1.16.1`
-
-### Documentation
-https://velero.io/docs/v1.16/
-
-### Upgrading
-https://velero.io/docs/v1.16/upgrade-to-1.16/
-
-### All Changes
-  * Call WaitGroup.Done() once only when PVB changes to final status the first time to avoid panic (#8940, @ywk253100)
-  * Add VolumeSnapshotContent into the RIA and the mustHave resource list. (#8926, @blackpiglet)
-  * Warn for not found error in patching managed fields (#8916, @sseago)
-  * Fix issue 8878, relief node os deduction error checks (#8911, @Lyndon-Li)
-
-
 ## v1.16

 ### Download
--- a/changelogs/CHANGELOG-1.17.md
+++ b/changelogs/CHANGELOG-1.17.md
@@ -0,0 +1,143 @@
+## v1.17
+
+### Download
+https://github.com/vmware-tanzu/velero/releases/tag/v1.17.0
+
+### Container Image
+`velero/velero:v1.17.0`
+
+### Documentation
+https://velero.io/docs/v1.17/
+
+### Upgrading
+https://velero.io/docs/v1.17/upgrade-to-1.17/
+
+### Highlights
+#### Modernized fs-backup
+In v1.17, Velero fs-backup is modernized to the micro-service architecture, which brings below benefits:  
+- Many features that were absent from fs-backup are now available, e.g., load concurrency control, cancel, resume on restart, etc.
+- fs-backup is more robust, the running backup/restore could survive from node-agent restart; and the resource allocation is in a more granular manner, the failure of one backup/restore won't impact others.  
+- The resource usage of node-agent is steady, especially, the node-agent pods won't request huge memory and hold it for a long time.  
+
+Check design https://github.com/vmware-tanzu/velero/blob/main/design/vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md for more details.  
+
+#### fs-backup support Windows cluster
+In v1.17, Velero fs-backup supports backup/restore of Windows workloads. By leveraging the new micro-service architecture for fs-backup, data mover pods could run in Windows nodes and backup/restore Windows volumes. Together with CSI snapshot data movement for Windows which is delivered in 1.16, Velero now supports Windows workload backup/restore in full scenarios.  
+Check design https://github.com/vmware-tanzu/velero/blob/main/design/vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md for more details.  
+
+#### Volume group snapshot support
+In v1.17, Velero supports [volume group snapshots](https://kubernetes.io/blog/2024/12/18/kubernetes-1-32-volume-group-snapshot-beta/) which is a beta feature in Kubernetes upstream, for both CSI snapshot backup and CSI snapshot data movement. This allows a snapshot to be taken from multiple volumes at the same point-in-time to achieve write order consistency, which is helpful to achieve better data consistency when multiple volumes being backed up are correlated.  
+Check the document https://velero.io/docs/main/volume-group-snapshots/ for more details.  
+
+#### Priority class support
+In v1.17, [Kubernetes priority class](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass) is supported for all modules across Velero. Specifically, users are allowed to configure priority class to Velero server, node-agent, data mover pods, backup repository maintenance jobs separately.  
+Check design https://github.com/vmware-tanzu/velero/blob/main/design/Implemented/priority-class-name-support_design.md for more details.  
+
+#### Scalability and Resiliency improvements of data movers
+##### Reduce excessive number of data mover pods in Pending state
+In v1.17, Velero allows users to set a `PrepareQueueLength` in the node-agent configuration; data mover pods and volumes beyond this number won't be created until data path quota is available, so that an excessive number of cluster resources won't be taken unnecessarily, which is particularly helpful for large scale environments. This improvement applies to all kinds of data movements, including fs-backup and CSI snapshot data movement.  
+Check design https://github.com/vmware-tanzu/velero/blob/main/design/node-agent-load-soothing.md for more details.  
+
+##### Enhancement on node-agent restart handling for data movements
+In v1.17, data movements in all phases could survive from node-agent restart and resume themselves; when a data movement gets orphaned in special cases, e.g., cluster node absent, it could also be canceled appropriately after the restart. This improvement applies to all kinds of data movements, including fs-backup and CSI snapshot data movement.  
+Check issue https://github.com/vmware-tanzu/velero/issues/8534 for more details.  
+
+##### CSI snapshot data movement restore node-selection and node-selection by storage class
+In v1.17, CSI snapshot data movement restore acquires the same node-selection capability as backup, that is, users could specify which nodes can/cannot run data mover pods for both backup and restore now. And users are also allowed to configure the node-selection per storage class, which is particularly helpful to the environments where a storage class is not usable by all cluster nodes.  
+Check issue https://github.com/vmware-tanzu/velero/issues/8186 and https://github.com/vmware-tanzu/velero/issues/8223 for more details.  
+
+#### Include/exclude policy support for resource policy
+In v1.17, Velero resource policy supports `includeExcludePolicy` besides the existing `volumePolicy`. This allows users to set include/exclude filters for resources in a resource policy configmap, so that these filters are reusable among multiple backups.  
+Check the document https://velero.io/docs/main/resource-filtering/#creating-resource-policies:~:text=resources%3D%22*%22-,Resource%20policies,-Velero%20provides%20resource for more details.  
+
+### Runtime and dependencies
+Golang runtime: 1.24.6  
+kopia: 0.21.1  
+
+### Limitations/Known issues
+
+### Breaking changes
+#### Deprecation of Restic
+According to [Velero deprecation policy](https://github.com/vmware-tanzu/velero/blob/main/GOVERNANCE.md#deprecation-policy), backup of fs-backup under Restic path is removed in v1.17, so `--uploader-type=restic` is not a valid installation configuration anymore. This means you cannot create a backup under Restic path, but you can still restore from the previous backups under Restic path until v1.19.  
+
+#### Repository maintenance job configurations are removed from Velero server parameter
+Since the repository maintenance job configurations are moved to the repository maintenance job ConfigMap, the following Velero server parameters are removed in v1.17:
+- --keep-latest-maintenance-jobs
+- --maintenance-job-cpu-request
+- --maintenance-job-mem-request
+- --maintenance-job-cpu-limit
+- --maintenance-job-mem-limit
+
+### All Changes
+  * Add ConfigMap parameters validation for install CLI and server start. (#9200, @blackpiglet)
+  * Add priorityclasses to high priority restore list (#9175, @kaovilai)
+  * Introduced context-based logger for backend implementations (Azure, GCS, S3, and Filesystem) (#9168, @priyansh17)
+  * Fix issue #9140, add os=windows:NoSchedule toleration for Windows pods (#9165, @Lyndon-Li)
+  * Remove the repository maintenance job parameters from velero server. (#9147, @blackpiglet)
+  * Add include/exclude policy to resources policy (#9145, @reasonerjt)
+  * Add ConfigMap support for keepLatestMaintenanceJobs with CLI parameter fallback (#9135, @shubham-pampattiwar)
+  * Fix the dd and du's node affinity issue. (#9130, @blackpiglet)
+  * Remove the WaitUntilVSCHandleIsReady from vs BIA. (#9124, @blackpiglet)
+  * Add comprehensive Volume Group Snapshots documentation with workflow diagrams and examples (#9123, @shubham-pampattiwar)
+  * Fix issue #9065, add doc for node-agent prepare queue length (#9118, @Lyndon-Li)
+  * Fix issue #9095, update restore doc for PVC selected-node (#9117, @Lyndon-Li)
+  * Update CSI Snapshot Data Movement doc for issue #8534, #8185 (#9113, @Lyndon-Li)
+  * Fix issue #8986, refactor fs-backup doc after VGDP Micro Service for fs-backup (#9112, @Lyndon-Li)
+  * Return error if timeout when checking server version (#9111, @ywk253100)
+  * Update "Default Volumes to Fs Backup" to "File System Backup (Default)" (#9105, @shubham-pampattiwar)
+  * Fix issue #9077, don't block backup deletion on list VS error (#9100, @Lyndon-Li)
+  * Bump up Kopia to v0.21.1 (#9098, @Lyndon-Li)
+  * Add imagePullSecrets inheritance for VGDP pod and maintenance job. (#9096, @blackpiglet)
+  * Avoid checking the VS and VSC status in the backup finalizing phase. (#9092, @blackpiglet)
+  * Fix issue #9053, Always remove selected-node annotation during PVC restore when no node mapping exists. Breaking change: Previously, the annotation was preserved if the node existed. (#9076, @Lyndon-Li)
+  * Enable parameterized kubelet mount path during node-agent installation (#9074, @longxiucai)
+  * Fix issue #8857, support third party tolerations for data mover pods (#9072, @Lyndon-Li)
+  * Fix issue #8813, remove restic from the valid uploader type (#9069, @Lyndon-Li)
+  * Fix issue #8185, allow users to disable pod volume host path mount for node-agent (#9068, @Lyndon-Li)
+  * Fix #8344, add the design for a mechanism to soothe creation of data mover pods for DataUpload, DataDownload, PodVolumeBackup and PodVolumeRestore (#9067, @Lyndon-Li)
+  * Fix #8344, add a mechanism to soothe creation of data mover pods for DataUpload, DataDownload, PodVolumeBackup and PodVolumeRestore (#9064, @Lyndon-Li)
+  * Add Gauge metric for BSL availability (#9059, @reasonerjt)
+  * Fix missing defaultVolumesToFsBackup flag output in Velero describe backup cmd (#9056, @shubham-pampattiwar)
+  * Allow for proper tracking of multiple hooks per container (#9048, @sseago)
+  * Make the backup repository controller doesn't invalidate the BSL on restart (#9046, @blackpiglet)
+  * Removed username/password credential handling from newConfigCredential as azidentity.UsernamePasswordCredentialOptions is reported as deprecated. (#9041, @priyansh17)
+  * Remove dependency with VolumeSnapshotClass in DataUpload. (#9040, @blackpiglet)
+  * Fix issue #8961, cancel PVB/PVR on Velero server restart (#9031, @Lyndon-Li)
+  * Fix issue #8962, resume PVB/PVR during node-agent restarts (#9030, @Lyndon-Li)
+  * Bump kopia v0.20.1 (#9027, @Lyndon-Li)
+  * Fix issue #8965, support PVB/PVR's cancel state in the backup/restore (#9026, @Lyndon-Li)
+  * Fix Issue 8816 When specifying LabelSelector on restore, related items such as PVC and VolumeSnapshot are not included (#9024, @amastbau)
+  * Fix issue #8963, add legacy PVR controller for Restic path (#9022, @Lyndon-Li)
+  * Fix issue #8964, add Windows support for VGDP MS for fs-backup (#9021, @Lyndon-Li)
+  * Accommodate VGS workflows in PVC CSI plugin (#9019, @shubham-pampattiwar)
+  * Fix issue #8958, add VGDP MS PVB controller (#9015, @Lyndon-Li)
+  * Fix issue #8959, add VGDP MS PVR controller (#9014, @Lyndon-Li)
+  * Fix issue #8988, add data path for VGDP ms PVR (#9005, @Lyndon-Li)
+  * Fix issue #8988, add data path for VGDP ms pvb (#8998, @Lyndon-Li)
+  * Skip VS and VSC not created by backup. (#8990, @blackpiglet)
+  * Make ResticIdentifier optional for kopia BackupRepositories (#8987, @kaovilai)
+  * Fix issue #8960, implement PodVolume exposer for PVB/PVR (#8985, @Lyndon-Li)
+  * fix: update mc command in minio-deployment example (#8982, @vishal-chdhry)
+  * Fix issue #8957, add design for VGDP MS for fs-backup (#8979, @Lyndon-Li)
+  * Add BSL status check for backup/restore operations. (#8976, @blackpiglet)
+  * Mark BackupRepository not ready when BSL changed (#8975, @ywk253100)
+  * Add support for [distributed snapshotting](https://github.com/kubernetes-csi/external-snapshotter/tree/4cedb3f45790ac593ebfa3324c490abedf739477?tab=readme-ov-file#distributed-snapshotting) (#8969, @flx5)
+  * Fix issue #8534, refactor dm controllers to tolerate cancel request in more cases, e.g., node restart, node drain (#8952, @Lyndon-Li)
+  * The backup and restore VGDP affinity enhancement implementation. (#8949, @blackpiglet)
+  * Remove CSI VS and VSC metadata from backup. (#8946, @blackpiglet)
+  * Extend PVCAction itemblock plugin to support grouping PVCs under VGS label key (#8944, @shubham-pampattiwar)
+  * Copy security context from origin pod (#8943, @farodin91)
+  * Add support for configuring VGS label key (#8938, @shubham-pampattiwar)
+  * Add VolumeSnapshotContent into the RIA and the mustHave resource list. (#8924, @blackpiglet)
+  * Mounted cloud credentials should not be world-readable (#8919, @sseago)
+  * Warn for not found error in patching managed fields (#8902, @sseago)
+  * Fix issue 8878, relief node os deduction error checks (#8891, @Lyndon-Li)
+  * Skip namespace in terminating state in backup resource collection. (#8890, @blackpiglet)
+  * Implement PriorityClass Support (#8883, @kaovilai)
+  * Fix Velero adding restore-wait init container when not needed. (#8880, @kaovilai)
+  * Pass the logger in kopia related operations. (#8875, @hu-keyu)
+  * Inherit the dnsPolicy and dnsConfig from the node agent pod. This is done so that the kopia task uses the same configuration. (#8845, @flx5)
+  * Add design for VolumeGroupSnapshot support (#8778, @shubham-pampattiwar)
+  * Inherit k8s default volumeSnapshotClass. (#8719, @hu-keyu)
+  * CLI automatically discovers and uses cacert from BSL for download requests (#8557, @kaovilai)
+  * This PR aims to add s390x support to Velero binary. (#7505, @pandurangkhandeparker)
--- a/changelogs/unreleased/9132-mjnagel
+++ b/changelogs/unreleased/9132-mjnagel
@@ -0,0 +1 @@
+Add `--apply` flag to `install` command, allowing usage of Kubernetes apply to make changes to existing installs
--- a/changelogs/unreleased/9141-kaovilai
+++ b/changelogs/unreleased/9141-kaovilai
@@ -0,0 +1 @@
+feat: Enhance BackupStorageLocation with Secret-based CA certificate support
--- a/changelogs/unreleased/9148-Lyndon-Li
+++ b/changelogs/unreleased/9148-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #7725, add design for backup repo cache configuration
--- a/changelogs/unreleased/9166-claude
+++ b/changelogs/unreleased/9166-claude
@@ -0,0 +1 @@
+Add VolumePolicy support for PVC Phase conditions to allow skipping Pending PVCs
--- a/changelogs/unreleased/9173-clementnuss
+++ b/changelogs/unreleased/9173-clementnuss
@@ -0,0 +1 @@
+feat: Permit specifying annotations for the BackupPVC
--- a/changelogs/unreleased/9206-Joeavaikath
+++ b/changelogs/unreleased/9206-Joeavaikath
@@ -0,0 +1 @@
+Remove labels associated with previous backups
--- a/changelogs/unreleased/9226-sseago
+++ b/changelogs/unreleased/9226-sseago
@@ -0,0 +1 @@
+Get pod list once per namespace in pvc IBA
--- a/changelogs/unreleased/9233-Lyndon-Li
+++ b/changelogs/unreleased/9233-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9229, don't attach backupPVC to the source node
--- a/changelogs/unreleased/9248-0xLeo258
+++ b/changelogs/unreleased/9248-0xLeo258
@@ -0,0 +1 @@
+Protect VolumeSnapshot field from race condition during multi-thread backup
--- a/changelogs/unreleased/9255-Joeavaikath
+++ b/changelogs/unreleased/9255-Joeavaikath
@@ -0,0 +1,10 @@
+Implement wildcard namespace pattern expansion for backup namespace includes/excludes.
+
+This change adds support for wildcard patterns (*, ?, [abc], {a,b,c}) in namespace includes and excludes during backup operations. 
+When wildcard patterns are detected, they are expanded against the list of active namespaces in the cluster before the backup proceeds.
+
+Key features:
+- Wildcard patterns in namespace includes/excludes are automatically detected and expanded
+- Pattern validation ensures unsupported patterns (regex, consecutive asterisks) are rejected
+- Empty wildcard results (e.g., "invalid*" matching no namespaces) correctly result in empty backups
+- Exact namespace names and "*" continue to work as before (no expansion needed)
--- a/changelogs/unreleased/9256-shubham-pampattiwar
+++ b/changelogs/unreleased/9256-shubham-pampattiwar
@@ -0,0 +1 @@
+Fix repository maintenance jobs to inherit allowlisted tolerations from Velero deployment
--- a/changelogs/unreleased/9261-priyansh17
+++ b/changelogs/unreleased/9261-priyansh17
@@ -1 +0,0 @@
-Backport to 1.16 (PR#9244 Update AzureAD Microsoft Authentication Library to v1.5.0)
--- a/changelogs/unreleased/9264-shubham-pampattiwar
+++ b/changelogs/unreleased/9264-shubham-pampattiwar
@@ -0,0 +1 @@
+Fix schedule controller to prevent backup queue accumulation during extended blocking scenarios by properly handling empty backup phases
--- a/changelogs/unreleased/9269-Lyndon-Li
+++ b/changelogs/unreleased/9269-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #7904, remove the code and doc for PVC node selection
--- a/changelogs/unreleased/9281-0xLeo258
+++ b/changelogs/unreleased/9281-0xLeo258
@@ -0,0 +1 @@
+Implement concurrency control for cache of native VolumeSnapshotter plugin.
--- a/changelogs/unreleased/9291-Lyndon-Li
+++ b/changelogs/unreleased/9291-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9193, don't connect repo in repo controller
--- a/changelogs/unreleased/9295-sseago
+++ b/changelogs/unreleased/9295-sseago
@@ -0,0 +1 @@
+Add option for privileged fs-backup pod
--- a/changelogs/unreleased/9296-Lyndon-Li
+++ b/changelogs/unreleased/9296-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9267, add events to data mover prepare diagnostic
--- a/changelogs/unreleased/9302-blackpiglet
+++ b/changelogs/unreleased/9302-blackpiglet
@@ -0,0 +1 @@
+VerifyJSONConfigs verifies every element in Data.
--- a/changelogs/unreleased/9307-sseago
+++ b/changelogs/unreleased/9307-sseago
@@ -0,0 +1 @@
+Concurrent backup processing
--- a/changelogs/unreleased/9321-shubham-pampattiwar
+++ b/changelogs/unreleased/9321-shubham-pampattiwar
@@ -0,0 +1 @@
+Sanitize Azure HTTP responses in BSL status messages
--- a/changelogs/unreleased/9329-T4iFooN-IX
+++ b/changelogs/unreleased/9329-T4iFooN-IX
@@ -0,0 +1 @@
+Fix typos in documentation
--- a/changelogs/unreleased/9333-Lyndon-Li
+++ b/changelogs/unreleased/9333-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9332, add bytesDone for cache files
--- a/changelogs/unreleased/9342-Lyndon-Li
+++ b/changelogs/unreleased/9342-Lyndon-Li
@@ -0,0 +1 @@
+Add cache configuration to VGDP
--- a/changelogs/unreleased/9350-blackpiglet
+++ b/changelogs/unreleased/9350-blackpiglet
@@ -0,0 +1 @@
+Fix the Job build error when the BackupRepository name is longer than 63 characters.
--- a/changelogs/unreleased/9353-Lyndon-Li
+++ b/changelogs/unreleased/9353-Lyndon-Li
@@ -0,0 +1 @@
+Add cache dir configuration for udmrepo
--- a/changelogs/unreleased/9354-Lyndon-Li
+++ b/changelogs/unreleased/9354-Lyndon-Li
@@ -0,0 +1 @@
+Add snapshotSize for DataDownload, PodVolumeRestore
--- a/changelogs/unreleased/9357-sseago
+++ b/changelogs/unreleased/9357-sseago
@@ -0,0 +1 @@
+Add incrementalSize to DU/PVB for reporting new/changed size
--- a/changelogs/unreleased/9362-Lyndon-Li
+++ b/changelogs/unreleased/9362-Lyndon-Li
@@ -0,0 +1 @@
+Support cache volume for generic restore exposer and pod volume exposer
--- a/changelogs/unreleased/9366-blackpiglet
+++ b/changelogs/unreleased/9366-blackpiglet
@@ -0,0 +1 @@
+Use hookIndex for recording multiple restore exec hooks.
--- a/changelogs/unreleased/9367-shubham-pampattiwar
+++ b/changelogs/unreleased/9367-shubham-pampattiwar
@@ -0,0 +1 @@
+Fix managed fields patch for resources using GenerateName
--- a/changelogs/unreleased/9368-shubham-pampattiwar
+++ b/changelogs/unreleased/9368-shubham-pampattiwar
@@ -0,0 +1 @@
+Track actual resource names for GenerateName in restore status
--- a/changelogs/unreleased/9370-Lyndon-Li
+++ b/changelogs/unreleased/9370-Lyndon-Li
@@ -0,0 +1 @@
+Add cache volume configuration
--- a/changelogs/unreleased/9375-Lyndon-Li
+++ b/changelogs/unreleased/9375-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9365, prevent fake completion notification due to multiple updates of a single PVR
--- a/changelogs/unreleased/9379-Lyndon-Li
+++ b/changelogs/unreleased/9379-Lyndon-Li
@@ -0,0 +1 @@
+Refactor repo provider interface for static configuration
--- a/changelogs/unreleased/9389-sseago
+++ b/changelogs/unreleased/9389-sseago
@@ -0,0 +1 @@
+don't copy securitycontext from first container if configmap found
--- a/changelogs/unreleased/9391-Lyndon-Li
+++ b/changelogs/unreleased/9391-Lyndon-Li
@@ -0,0 +1 @@
+Cache volume support for DataDownload
--- a/changelogs/unreleased/9397-Lyndon-Li
+++ b/changelogs/unreleased/9397-Lyndon-Li
@@ -0,0 +1 @@
+Cache volume for PVR
--- a/changelogs/unreleased/9407-Lyndon-Li
+++ b/changelogs/unreleased/9407-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9400, connect repo first time after creation so that init params could be written
--- a/changelogs/unreleased/9414-shubham-pampattiwar
+++ b/changelogs/unreleased/9414-shubham-pampattiwar
@@ -0,0 +1 @@
+Add Prometheus metrics for maintenance jobs
--- a/changelogs/unreleased/9418-Lyndon-Li
+++ b/changelogs/unreleased/9418-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9276, add doc for cache volume support
--- a/changelogs/unreleased/9419-shubham-pampattiwar
+++ b/changelogs/unreleased/9419-shubham-pampattiwar
@@ -0,0 +1 @@
+Apply volume policies to VolumeGroupSnapshot PVC filtering
--- a/changelogs/unreleased/9420-Lyndon-Li
+++ b/changelogs/unreleased/9420-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9194, add doc for GOMAXPROCS behavior change
--- a/changelogs/unreleased/9431-blackpiglet
+++ b/changelogs/unreleased/9431-blackpiglet
@@ -0,0 +1 @@
+Remove VolumeSnapshotClass from CSI B/R process.
--- a/changelogs/unreleased/9441-shubham-pampattiwar
+++ b/changelogs/unreleased/9441-shubham-pampattiwar
@@ -0,0 +1 @@
+Add PVC-to-Pod cache to improve volume policy performance
--- a/changelogs/unreleased/9445-mpryc
+++ b/changelogs/unreleased/9445-mpryc
@@ -0,0 +1 @@
+Fix plugin init container names exceeding DNS-1123 limit
--- a/changelogs/unreleased/9452-blackpiglet
+++ b/changelogs/unreleased/9452-blackpiglet
@@ -0,0 +1 @@
+Add maintenance job and data mover pod's labels and annotations setting.
--- a/changelogs/unreleased/9474-blackpiglet
+++ b/changelogs/unreleased/9474-blackpiglet
@@ -0,0 +1 @@
+Add Role, RoleBinding, ClusterRole, and ClusterRoleBinding in restore sequence.
--- a/changelogs/unreleased/9481-Lyndon-Li
+++ b/changelogs/unreleased/9481-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9478, add diagnostic info when expose peek fails
--- a/config/crd/v1/bases/velero.io_backuprepositories.yaml
+++ b/config/crd/v1/bases/velero.io_backuprepositories.yaml
@@ -71,7 +71,7 @@ spec:
              resticIdentifier:
                description: |-
                  ResticIdentifier is the full restic-compatible string for identifying
-                  this repository.
+                  this repository. This field is only used when RepositoryType is "restic".
                type: string
              volumeNamespace:
                description: |-
@@ -81,7 +81,6 @@ spec:
            required:
            - backupStorageLocation
            - maintenanceFrequency
-            - resticIdentifier
            - volumeNamespace
            type: object
          status:
--- a/config/crd/v1/bases/velero.io_backups.yaml
+++ b/config/crd/v1/bases/velero.io_backups.yaml
@@ -507,6 +507,10 @@ spec:
                      uploads to perform when using the uploader.
                    type: integer
                type: object
+              volumeGroupSnapshotLabelKey:
+                description: VolumeGroupSnapshotLabelKey specifies the label key to
+                  group PVCs under a VGS.
+                type: string
              volumeSnapshotLocations:
                description: VolumeSnapshotLocations is a list containing names of
                  VolumeSnapshotLocations associated with this backup.
@@ -590,6 +594,8 @@ spec:
                description: Phase is the current state of the Backup.
                enum:
                - New
+                - Queued
+                - ReadyToStart
                - FailedValidation
                - InProgress
                - WaitingForPluginOperations
@@ -621,6 +627,11 @@ spec:
                      filters that happen as items are processed.
                    type: integer
                type: object
+              queuePosition:
+                description: |-
+                  QueuePosition is the position of the backup in the queue.
+                  Only relevant when Phase is "Queued"
+                type: integer
              startTimestamp:
                description: |-
                  StartTimestamp records the time a backup was started.
--- a/config/crd/v1/bases/velero.io_backupstoragelocations.yaml
+++ b/config/crd/v1/bases/velero.io_backupstoragelocations.yaml
@@ -113,10 +113,38 @@ spec:
                    description: Bucket is the bucket to use for object storage.
                    type: string
                  caCert:
-                    description: CACert defines a CA bundle to use when verifying
-                      TLS connections to the provider.
+                    description: |-
+                      CACert defines a CA bundle to use when verifying TLS connections to the provider.
+                      Deprecated: Use CACertRef instead.
                    format: byte
                    type: string
+                  caCertRef:
+                    description: |-
+                      CACertRef is a reference to a Secret containing the CA certificate bundle to use
+                      when verifying TLS connections to the provider. The Secret must be in the same
+                      namespace as the BackupStorageLocation.
+                    properties:
+                      key:
+                        description: The key of the secret to select from.  Must be
+                          a valid secret key.
+                        type: string
+                      name:
+                        default: ""
+                        description: |-
+                          Name of the referent.
+                          This field is effectively required, but due to backwards compatibility is
+                          allowed to be empty. Instances of this type with an empty value here are
+                          almost certainly wrong.
+                          More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                        type: string
+                      optional:
+                        description: Specify whether the Secret or its key must be
+                          defined
+                        type: boolean
+                    required:
+                    - key
+                    type: object
+                    x-kubernetes-map-type: atomic
                  prefix:
                    description: Prefix is the path inside a bucket to use for Velero
                      storage. Optional.
--- a/config/crd/v1/bases/velero.io_podvolumebackups.yaml
+++ b/config/crd/v1/bases/velero.io_podvolumebackups.yaml
@@ -15,38 +15,47 @@ spec:
  scope: Namespaced
  versions:
  - additionalPrinterColumns:
-    - description: Pod Volume Backup status such as New/InProgress
+    - description: PodVolumeBackup status such as New/InProgress
      jsonPath: .status.phase
      name: Status
      type: string
-    - description: Time when this backup was started
+    - description: Time duration since this PodVolumeBackup was started
      jsonPath: .status.startTimestamp
-      name: Created
+      name: Started
      type: date
-    - description: Namespace of the pod containing the volume to be backed up
-      jsonPath: .spec.pod.namespace
-      name: Namespace
-      type: string
-    - description: Name of the pod containing the volume to be backed up
-      jsonPath: .spec.pod.name
-      name: Pod
-      type: string
-    - description: Name of the volume to be backed up
-      jsonPath: .spec.volume
-      name: Volume
-      type: string
-    - description: The type of the uploader to handle data transfer
-      jsonPath: .spec.uploaderType
-      name: Uploader Type
-      type: string
+    - description: Completed bytes
+      format: int64
+      jsonPath: .status.progress.bytesDone
+      name: Bytes Done
+      type: integer
+    - description: Total bytes
+      format: int64
+      jsonPath: .status.progress.totalBytes
+      name: Total Bytes
+      type: integer
+    - description: Incremental bytes
+      format: int64
+      jsonPath: .status.incrementalBytes
+      name: Incremental Bytes
+      priority: 10
+      type: integer
    - description: Name of the Backup Storage Location where this backup should be
        stored
      jsonPath: .spec.backupStorageLocation
      name: Storage Location
      type: string
-    - jsonPath: .metadata.creationTimestamp
+    - description: Time duration since this PodVolumeBackup was created
+      jsonPath: .metadata.creationTimestamp
      name: Age
      type: date
+    - description: Name of the node where the PodVolumeBackup is processed
+      jsonPath: .status.node
+      name: Node
+      type: string
+    - description: The type of the uploader to handle data transfer
+      jsonPath: .spec.uploaderType
+      name: Uploader
+      type: string
    name: v1
    schema:
      openAPIV3Schema:
@@ -76,6 +85,11 @@ spec:
                  BackupStorageLocation is the name of the backup storage location
                  where the backup repository is stored.
                type: string
+              cancel:
+                description: |-
+                  Cancel indicates request to cancel the ongoing PodVolumeBackup. It can be set
+                  when the PodVolumeBackup is in InProgress phase
+                type: boolean
              node:
                description: Node is the name of the node that the Pod is running
                  on.
@@ -165,6 +179,13 @@ spec:
          status:
            description: PodVolumeBackupStatus is the current status of a PodVolumeBackup.
            properties:
+              acceptedTimestamp:
+                description: |-
+                  AcceptedTimestamp records the time the pod volume backup is to be prepared.
+                  The server's time is used for AcceptedTimestamp
+                format: date-time
+                nullable: true
+                type: string
              completionTimestamp:
                description: |-
                  CompletionTimestamp records the time a backup was completed.
@@ -174,6 +195,11 @@ spec:
                format: date-time
                nullable: true
                type: string
+              incrementalBytes:
+                description: IncrementalBytes holds the number of bytes new or changed
+                  since the last backup
+                format: int64
+                type: integer
              message:
                description: Message is a message about the pod volume backup's status.
                type: string
@@ -185,7 +211,11 @@ spec:
                description: Phase is the current state of the PodVolumeBackup.
                enum:
                - New
+                - Accepted
+                - Prepared
                - InProgress
+                - Canceling
+                - Canceled
                - Completed
                - Failed
                type: string
--- a/config/crd/v1/bases/velero.io_podvolumerestores.yaml
+++ b/config/crd/v1/bases/velero.io_podvolumerestores.yaml
@@ -15,39 +15,40 @@ spec:
  scope: Namespaced
  versions:
  - additionalPrinterColumns:
-    - description: Namespace of the pod containing the volume to be restored
-      jsonPath: .spec.pod.namespace
-      name: Namespace
+    - description: PodVolumeRestore status such as New/InProgress
+      jsonPath: .status.phase
+      name: Status
      type: string
-    - description: Name of the pod containing the volume to be restored
-      jsonPath: .spec.pod.name
-      name: Pod
+    - description: Time duration since this PodVolumeRestore was started
+      jsonPath: .status.startTimestamp
+      name: Started
+      type: date
+    - description: Completed bytes
+      format: int64
+      jsonPath: .status.progress.bytesDone
+      name: Bytes Done
+      type: integer
+    - description: Total bytes
+      format: int64
+      jsonPath: .status.progress.totalBytes
+      name: Total Bytes
+      type: integer
+    - description: Name of the Backup Storage Location where the backup data is stored
+      jsonPath: .spec.backupStorageLocation
+      name: Storage Location
+      type: string
+    - description: Time duration since this PodVolumeRestore was created
+      jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - description: Name of the node where the PodVolumeRestore is processed
+      jsonPath: .status.node
+      name: Node
      type: string
    - description: The type of the uploader to handle data transfer
      jsonPath: .spec.uploaderType
      name: Uploader Type
      type: string
-    - description: Name of the volume to be restored
-      jsonPath: .spec.volume
-      name: Volume
-      type: string
-    - description: Pod Volume Restore status such as New/InProgress
-      jsonPath: .status.phase
-      name: Status
-      type: string
-    - description: Pod Volume Restore status such as New/InProgress
-      format: int64
-      jsonPath: .status.progress.totalBytes
-      name: TotalBytes
-      type: integer
-    - description: Pod Volume Restore status such as New/InProgress
-      format: int64
-      jsonPath: .status.progress.bytesDone
-      name: BytesDone
-      type: integer
-    - jsonPath: .metadata.creationTimestamp
-      name: Age
-      type: date
    name: v1
    schema:
      openAPIV3Schema:
@@ -77,6 +78,11 @@ spec:
                  BackupStorageLocation is the name of the backup storage location
                  where the backup repository is stored.
                type: string
+              cancel:
+                description: |-
+                  Cancel indicates request to cancel the ongoing PodVolumeRestore. It can be set
+                  when the PodVolumeRestore is in InProgress phase
+                type: boolean
              pod:
                description: Pod is a reference to the pod containing the volume to
                  be restored.
@@ -127,6 +133,10 @@ spec:
              snapshotID:
                description: SnapshotID is the ID of the volume snapshot to be restored.
                type: string
+              snapshotSize:
+                description: SnapshotSize is the logical size in Bytes of the snapshot.
+                format: int64
+                type: integer
              sourceNamespace:
                description: SourceNamespace is the original namespace for namaspace
                  mapping.
@@ -162,6 +172,13 @@ spec:
          status:
            description: PodVolumeRestoreStatus is the current status of a PodVolumeRestore.
            properties:
+              acceptedTimestamp:
+                description: |-
+                  AcceptedTimestamp records the time the pod volume restore is to be prepared.
+                  The server's time is used for AcceptedTimestamp
+                format: date-time
+                nullable: true
+                type: string
              completionTimestamp:
                description: |-
                  CompletionTimestamp records the time a restore was completed.
@@ -173,11 +190,19 @@ spec:
              message:
                description: Message is a message about the pod volume restore's status.
                type: string
+              node:
+                description: Node is name of the node where the pod volume restore
+                  is processed.
+                type: string
              phase:
                description: Phase is the current state of the PodVolumeRestore.
                enum:
                - New
+                - Accepted
+                - Prepared
                - InProgress
+                - Canceling
+                - Canceled
                - Completed
                - Failed
                type: string
--- a/config/crd/v1/bases/velero.io_schedules.yaml
+++ b/config/crd/v1/bases/velero.io_schedules.yaml
@@ -549,6 +549,10 @@ spec:
                          uploads to perform when using the uploader.
                        type: integer
                    type: object
+                  volumeGroupSnapshotLabelKey:
+                    description: VolumeGroupSnapshotLabelKey specifies the label key
+                      to group PVCs under a VGS.
+                    type: string
                  volumeSnapshotLocations:
                    description: VolumeSnapshotLocations is a list containing names
                      of VolumeSnapshotLocations associated with this backup.
--- a/config/crd/v1/crds/crds.go
+++ b/config/crd/v1/crds/crds.go
--- a/config/crd/v2alpha1/bases/velero.io_datadownloads.yaml
+++ b/config/crd/v2alpha1/bases/velero.io_datadownloads.yaml
@@ -108,6 +108,10 @@ spec:
                description: SnapshotID is the ID of the Velero backup snapshot to
                  be restored from.
                type: string
+              snapshotSize:
+                description: SnapshotSize is the logical size in Bytes of the snapshot.
+                format: int64
+                type: integer
              sourceNamespace:
                description: |-
                  SourceNamespace is the original namespace where the volume is backed up from.
--- a/config/crd/v2alpha1/bases/velero.io_datauploads.yaml
+++ b/config/crd/v2alpha1/bases/velero.io_datauploads.yaml
@@ -33,6 +33,12 @@ spec:
      jsonPath: .status.progress.totalBytes
      name: Total Bytes
      type: integer
+    - description: Incremental bytes
+      format: int64
+      jsonPath: .status.incrementalBytes
+      name: Incremental Bytes
+      priority: 10
+      type: integer
    - description: Name of the Backup Storage Location where this backup should be
        stored
      jsonPath: .spec.backupStorageLocation
@@ -87,6 +93,9 @@ spec:
                  of the CSI snapshot.
                nullable: true
                properties:
+                  driver:
+                    description: Driver is the driver used by the VolumeSnapshotContent
+                    type: string
                  snapshotClass:
                    description: SnapshotClass is the name of the snapshot class that
                      the volume snapshot is created with
@@ -170,6 +179,11 @@ spec:
                  as a result of the DataUpload.
                nullable: true
                type: object
+              incrementalBytes:
+                description: IncrementalBytes holds the number of bytes new or changed
+                  since the last backup
+                format: int64
+                type: integer
              message:
                description: Message is a message about the DataUpload's status.
                type: string
--- a/config/crd/v2alpha1/crds/crds.go
+++ b/config/crd/v2alpha1/crds/crds.go
--- a/design/Implemented/apply-flag.md
+++ b/design/Implemented/apply-flag.md
@@ -0,0 +1,70 @@
+# Apply flag for install command
+
+## Abstract
+Add an `--apply` flag to the install command that enables applying existing resources rather than creating them. This can be useful as part of the upgrade process for existing installations.
+
+## Background
+The current Velero install command creates resources but doesn't provide a direct way to apply updates to an existing installation.
+Users attempting to run the install command on an existing installation receive "already exists" messages.
+Upgrade steps for existing installs typically involve a three (or more) step process to apply updated CRDs (using `--dry-run` and piping to `kubectl apply`) and then updating/setting images on the Velero deployment and node-agent.
+
+## Goals
+- Provide a simple flag to enable applying resources on an existing Velero installation.
+- Use server-side apply to update existing resources rather than attempting to create them.
+- Maintain consistency with the regular install flow.
+
+## Non Goals
+- Implement special logic for specific version-to-version upgrades (e.g. resource deletion).
+- Add complex upgrade validation or pre/post-upgrade hooks.
+- Provide rollback capabilities.
+
+## High-Level Design
+The `--apply` flag will be added to the Velero install command.
+When this flag is set, the installation process will use server-side apply to update existing resources instead of using create on new resources.
+This flag can be used as _part_ of the upgrade process, but will not always fully handle an upgrade.
+
+## Detailed Design
+The implementation adds a new boolean flag `--apply` to the install command.
+This flag will be passed through to the underlying install functions where the resource creation logic resides.
+
+When the flag is set to true:
+- The `createOrApplyResource` function will use server-side apply with field manager "velero-cli" and `force=true` to update resources.
+- Resources will be applied in the same order as they would be created during installation.
+- Custom Resource Definitions will still be processed first, and the system will wait for them to be established before continuing.
+
+The server-side apply approach with `force=true` ensures that resources are updated even if there are conflicts with the last applied state.
+This provides a best-effort mechanism to apply resources that follows the same flow as installation but updates resources instead of creating them.
+
+No special handling is added for specific versions or resource structures, making this a general-purpose mechanism for applying resources.
+
+## Alternatives Considered
+1. Creating a separate `upgrade` command that would duplicate much of the install command logic.
+   - Rejected due to code duplication and maintenance overhead.
+
+2. Implementing version-specific upgrade logic to handle breaking changes between versions.
+   - Rejected as overly complex and difficult to maintain across multiple version paths.
+   - This could be considered again in the future, but is not in the scope of the current design.
+
+3. Adding automatic detection of existing resources and switching to apply mode.
+   - Rejected as it could lead to unexpected behavior and confusion if users unintentionally apply changes to existing resources.
+
+## Security Considerations
+The apply flag maintains the same security profile as the install command.
+No additional permissions are required beyond what is needed for resource creation.
+The use of `force=true` with server-side apply could potentially override manual changes made to resources, but this is a necessary trade-off to ensure apply is successful.
+
+## Compatibility
+This enhancement is compatible with all existing Velero installations as it is a new opt-in flag.
+It does not change any resource formats or API contracts.
+The apply process is best-effort and does not guarantee compatibility between arbitrary versions of Velero.
+Users should still consult release notes for any breaking changes that may require manual intervention.
+This flag could be adopted by the helm chart, specifically for CRD updates, to simplify the CRD update job.
+
+## Implementation
+The implementation involves:
+1. Adding support for `Apply` to the existing Kubernetes client code.
+1. Adding the `--apply` flag to the install command options.
+1. Changing `createResource` to `createOrApplyResource` and updating it to use server-side apply when the `apply` boolean is set.
+
+The implementation is straightforward and follows existing code patterns.
+No migration of state or special handling of specific resources is required.
--- a/design/Implemented/clean_artifacts_in_csi_flow.md
+++ b/design/Implemented/clean_artifacts_in_csi_flow.md
--- a/design/Implemented/include-exclude-in-resource-policy.md
+++ b/design/Implemented/include-exclude-in-resource-policy.md
@@ -0,0 +1,82 @@
+# Proposal to add include exclude policy to resource policy
+
+This enhancement will allow the user to set include and exclude filters for resources in a resource policy configmap, so that
+these filters are reusable and the user will not need to set them each time they create a backup.
+
+## Background
+As mentioned in issue [#8610](https://github.com/vmware-tanzu/velero/issues/8610), when there's a long list of resources 
+to include or exclude in a backup, it can be cumbersome to set them each time a backup is created.  There's a requirement to
+set these filters in a separate data structure so that they can be reused in multiple backups.
+
+## High-Level Design
+We may extend the data structure of resource policy to add `includeExcludePolicy`, which includes the include and exclude filters 
+in the BackupSpec.  When the user creates a backup which references the resource policy config `velero backup create --resource-policies-configmap <configmap-name>`,
+the filters in "includeExcludePolicy" will take effect to filter the resources when velero collects the resources to backup.
+
+## Detailed Design
+
+### Data Structure
+The map `includeExcludePolicy` contains four fields `includedClusterScopedResources`, `excludedClusterScopedResources`, 
+`includedNamespaceScopedResources`, `excludedNamespaceScopedResources`.  These filters work exactly as the filters defined in BackupSpec with
+the same names.  An example of the policy looks like:
+```yaml
+#omitted other irrelevant fields like 'version', 'volumePolicies'
+includeExcludePolicy:
+  includedClusterScopedResources:
+    - "cr"
+    - "crd"
+    - "pv"
+  excludedClusterScopedResources:
+    - "volumegroupsnapshotclass"
+    - "ingressclass"
+  includedNamespaceScopedResources:
+    - "pod"
+    - "service"
+    - "deployment"
+    - "pvc"
+  excludedNamespaceScopedResources:
+    - "configmap"
+```
+These filters are in the form of scoped include/exclude filters, which by design will not work with the "old" resource filters.
+Therefore, when a Backup references a resource policy configmap which has `includeExcludePolicy`, and at the same time it has 
+the "old" resource filters, i.e. `includedResources`, `excludedResources`, `includeClusterResources` set in the BackupSpec, the
+Backup will fail with a validation error.
+
+### Priorities 
+A user may set the include/exclude filters in Backupspec and also in the resource policy configmap.  In this case, the filters 
+in both the Backupspec and the resource policy configmap will take effect.  When there's a conflict, the filters in the Backupspec 
+will take precedence.  For example, if resource X is in the list of `includedNamespaceScopedResources` filter in the Backupspec, but 
+it's also in the list of `excludedNamespaceScopedResources` in the resource policy configmap, then resource X will be included in the backup.
+In this way, users can set the filters in the resource policy configmap to cover most of their use cases, and then override them 
+in the Backupspec when needed.
+
+### Implementation
+In addition to the data structure change, we will need to implement the following changes:
+1. A new function `CombineWithPolicy` will be added to the struct `ScopeIncludesExcludes`, which will combine the include/exclude filters
+in the resource policy configmap with the include/exclude filters in the Backupspec:  
+```go
+func (ie *ScopeIncludesExcludes) CombineWithPolicy(policy resourcepolicies.IncludeExcludePolicy) {
+	mapFunc := scopeResourceMapFunc(ie.helper)
+	for _, item := range policy.ExcludedNamespaceScopedResources {
+		resolvedItem := mapFunc(item, true)
+		if resolvedItem == "" {
+			continue
+		}
+		if !ie.ShouldInclude(resolvedItem) && !ie.ShouldExclude(resolvedItem) {
+			// The existing includeExcludes in the struct has higher priority, therefore, we should only add the item to the filter
+			// when the struct does not include this item and this item is not yet in the excludes filter.
+			ie.namespaceScopedResourceFilter.excludes.Insert(resolvedItem)
+		}
+		
+	}
+.....
+```
+This function will be called in the `kubernetesBackupper.BackupWithResolvers` function, to make sure the combined `ScopeIncludesExcludes` 
+filter will be assigned to the `ResourceIncludesExcludes` filter of the Backup request.
+
+2. Extra validation code will be added to the function `prepareBackupRequest` of `BackupReconciler` to check if there are "old"
+Resource filters in the BackupSpec when the Backup references a resource policy configmap which has `includeExcludePolicy`.
+
+## Alternatives Considered
+We may put `includeExcludePolicy` in a separate configmap, but it will require adding extra field to BackupSpec to reference the configmap,
+which is not necessary.
--- a/design/Implemented/node-agent-affinity.md
+++ b/design/Implemented/node-agent-affinity.md
@@ -128,5 +128,5 @@ Once this problem happens, the backupPod stays in `Pending` phase, and the corre
 On the other hand, the backupPod is deleted after the prepare timeout, so there is no way to tell the cause is one of the above problems or others.  
 To help the troubleshooting, we can add some diagnostic mechanism to discover the status of the backupPod and node-agent in the same node before deleting it as a result of the prepare timeout.  

-[1]: Implemented/unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
+[1]: unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
 [2]: volume-snapshot-data-movement/volume-snapshot-data-movement.md
--- a/design/Implemented/node-agent-load-soothing.md
+++ b/design/Implemented/node-agent-load-soothing.md
@@ -0,0 +1,121 @@
+# Node-agent Load Soothing Design
+
+## Glossary & Abbreviation
+
+**Velero Generic Data Path (VGDP)**: VGDP is the collective of modules that is introduced in [Unified Repository design][1]. Velero uses these modules to finish data transfer for various purposes (i.e., PodVolume backup/restore, Volume Snapshot Data Movement). VGDP modules include uploaders and the backup repository.  
+
+## Background
+
+As mentioned in [node-agent Concurrency design][2], [CSI Snapshot Data Movement design][3], [VGDP Micro Service design][4] and [VGDP Micro Service for fs-backup design][5], all data movement activities for CSI snapshot data movement backups/restores and fs-backup respect the `loadConcurrency` settings configured in the `node-agent-configmap`. Once the number of existing loads exceeds the corresponding `loadConcurrency` setting, the loads will be throttled and some loads will be held until VGDP quotas are available.  
+However, this throttling only happens after the data mover pod is started and gets to `running`. As a result, when there is a large number of concurrent volume backups, many data mover pods may be created while the VGDP instances inside them are actually on hold because of the VGDP throttling.  
+This could cause below problems:
+- In some environments, there is a pod limit in each node of the cluster or a pod limit throughout the cluster, too many of the inactive data mover pods may block other pods from running
+- In some environments, the system disk for each node of the cluster is limited, while pods also occupy system disk space, etc., many of the inactive data mover pods also take unnecessary space from system disk and cause other critical pods to be evicted
+- For CSI snapshot data movement backup, before creation of the data mover pod, the volume snapshot has also been created, this means an excessive number of snapshots may also be created and live for a longer time since the VGDP won't start until the quota is available. However, in some environments, a large number of snapshots is not allowed or may cause degradation of the storage performance
+
+On the other hand, the VGDP throttling mentioned in [node-agent Concurrency design][2] is an accurate controlling mechanism, that is, exactly the required number of data mover pods are throttled.  
+
+Therefore, another mechanism is required to soothe the creation of the data mover pods and volume snapshots before the VGDP throttling. It doesn't need to accurately control these creations but should effectively reduce the excessive number of inactive data mover pods and volume snapshots.  
+It is not practical to make an accurate control as it is almost impossible to predict which group of nodes a data mover pod is scheduled to, under the consideration of many complex factors, i.e., selected node, affinity, node OS, etc.  
+
+
+## Goals
+
+- Allow users to configure the expected number of loads pending on waiting for VGDP load concurrency quota
+- Create a soothing mechanism to prevent new loads from starting if the number of existing loads exceeds the expected number
+
+## Non-Goals
+- Accurately controlling the loads from initiation is not a goal  
+
+## Solution
+
+We introduce a new field `prepareQueueLength` in `loadConcurrency` of `node-agent-configmap` as the allowed number of loads that are under preparing (expose). Specifically, a load is in this situation when its CR is in the `Accepted` or `Prepared` phase. The `prepareQueueLength` should be a positive number; negative numbers will be ignored.  
+Once the value is set, the soothing mechanism takes effect, as the best effort, only the allowed number of CRs go into `Accepted` or `Prepared` phase, others will wait and stay as `New` state; and thereby only the allowed number of data mover pods, volume snapshots are created.  
+Otherwise, node-agent works the same as the legacy behavior, CRs go to `Accepted` or `Prepared` state as soon as the controllers process them and data mover pods and volume snapshots are also created without any constraints.  
+If users want to constrain the excessive number of pending data mover pods and volume snapshots, they could set a value by considering the VGDP load concurrency; otherwise, if they don't see constraints for pods or volume snapshots in their environment, they don't need to use this feature, in parallel preparing could also be beneficial for increasing the concurrency.  
+
+Node-agent server checks this configuration at startup time and uses it to initiate the related VGDP modules. Therefore, users could edit this configMap any time, but in order to make the changes effective, node-agent server needs to be restarted.  
+
+The data structure is as below:
+```go
+type LoadConcurrency struct {
+    // GlobalConfig specifies the concurrency number to all nodes for which per-node config is not specified
+    GlobalConfig int `json:"globalConfig,omitempty"`
+
+    // PerNodeConfig specifies the concurrency number to nodes matched by rules
+    PerNodeConfig []RuledConfigs `json:"perNodeConfig,omitempty"`
+
+    // PrepareQueueLength specifies the max number of loads that are under expose
+	PrepareQueueLength int `json:"prepareQueueLength,omitempty"`    
+}
+```
+
+### Sample
+A sample of the ConfigMap is as below:
+```json
+{
+    "loadConcurrency": {
+        "globalConfig": 2,
+        "perNodeConfig": [
+            {
+                "nodeSelector": {
+                    "matchLabels": {
+                        "kubernetes.io/hostname": "node1"
+                    }
+                },
+                "number": 3
+            },
+            {
+                "nodeSelector": {
+                    "matchLabels": {
+                        "beta.kubernetes.io/instance-type": "Standard_B4ms"
+                    }
+                },
+                "number": 5
+            }
+        ],
+        "prepareQueueLength": 2
+    }
+}
+```
+To create the configMap, users need to save something like the above sample to a json file and then run below command:
+```
+kubectl create cm <ConfigMap name> -n velero --from-file=<json file name>
+```
+
+## Detailed Design
+Changes apply to the DataUpload Controller, DataDownload Controller, PodVolumeBackup Controller and PodVolumeRestore Controller, as below:
+1. The soothe happens to data mover CRs (DataUpload, DataDownload, PodVolumeBackup or PodVolumeRestore) that are in `New` state
+2. Before starting processing the CR, the corresponding controller counts the existing CRs under or pending for expose in the cluster, that is a total number of existing DataUpload, DataDownload, PodVolumeBackup and PodVolumeRestore that are in either `Accepted` or `Prepared` state  
+3. If the total number doesn't exceed the allowed number, the controller sets the CR's phase to `Accepted`
+4. Once the total number exceeds the allowed number, the controller gives up processing the CR and has it requeued later. The delay for the requeue is 5 seconds
+
+The count happens for all the controllers in all nodes. To prevent the checks from draining the API server, the count is made against the controller client caches for those CRs. The count result is also cached, so that the count only happens when necessary. Below shows how it judges the necessity:
+- When one or more CRs' phase change to `Accepted`
+- When one or more CRs' phase change from `Accepted` to one of the terminal phases
+- When one or more CRs' phase change from `Prepared` to one of the terminal phases
+- When one or more CRs' phase change from `Prepared` to `InProgress`
+
+Ideally, steps 2~3 above need to be synchronized among controllers in all nodes. However, this synchronization is not implemented; the considerations are as below:    
+1. It is impossible to accurately synchronize the count among controllers in different nodes, because the client cache is not coherent among nodes.  
+2. It is possible to synchronize the count among controllers in the same node. However, it is too expensive to make this synchronization, because 2~3 are part of the expose workflow, the synchronization impacts the performance and stability of the existing workflow. 
+3. Even without the synchronization, the soothing mechanism still works eventually -- when the controllers see all the discharged loads (expected ones and over-discharged ones), they will stop creating new loads until the quota is available again.  
+4. Step 2~3 that need to be synchronized could complete very quickly.    
+
+This is why we say this mechanism is not an accurate control. In other words, it is possible that more loads than the number of `prepareQueueLength` are discharged if controllers make the count and expose in the overlapped time (step 2~3).  
+For example, when multiple controllers of the same type (DataUpload, DataDownload, PodVolumeBackup or PodVolumeRestore) from different nodes make the count:  
+```
+max number of waiting loads = number defined by `prepareQueueLength` + number of nodes in cluster
+```
+As another example, when hybrid loads are running the count concurrently, e.g., mix of data mover backups, data mover restores, pod volume backups or pod volume restores, more loads may be discharged and the number depends on the number of concurrent hybrid loads.  
+In either case, because step 2~3 is short in time, it is less likely to reach the theoretical worst result.  
+
+
+
+
+
+[1]: unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
+[2]: node-agent-concurrency.md
+[3]: volume-snapshot-data-movement/volume-snapshot-data-movement.md
+[4]: vgdp-micro-service/vgdp-micro-service.md
+[5]: vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md
--- a/design/Implemented/priority-class-name-support_design.md
+++ b/design/Implemented/priority-class-name-support_design.md
@@ -0,0 +1,694 @@
+# PriorityClass Support Design Proposal
+
+## Abstract
+
+This design document outlines the implementation of priority class name support for Velero components, including the Velero server deployment, node agent daemonset, and maintenance jobs. This feature allows users to specify a priority class name for Velero components, which can be used to influence the scheduling and eviction behavior of these components.
+
+## Background
+
+Kubernetes allows users to define priority classes, which can be used to influence the scheduling and eviction behavior of pods. Priority classes are defined as cluster-wide resources, and pods can reference them by name. When a pod is created, the priority admission controller uses the priority class name to populate the priority value for the pod. The scheduler then uses this priority value to determine the order in which pods are scheduled.
+
+Currently, Velero does not provide a way for users to specify a priority class name for its components. This can be problematic in clusters where resource contention is high, as Velero components may be evicted or not scheduled in a timely manner, potentially impacting backup and restore operations.
+
+## Goals
+
+- Add support for specifying priority class names for Velero components
+- Update the Velero CLI to accept priority class name parameters for different components
+- Update the Velero deployment, node agent daemonset, maintenance jobs, and data mover pods to use the specified priority class names
+
+## Non Goals
+
+- Creating or managing priority classes
+- Automatically determining the appropriate priority class for Velero components
+
+## High-Level Design
+
+The implementation will add new fields to the Velero options struct to store the priority class names for the server deployment and node agent daemonset. The Velero CLI will be updated to accept new flags for these components. For data mover pods and maintenance jobs, priority class names will be configured through existing ConfigMap mechanisms (`node-agent-configmap` for data movers and `repo-maintenance-job-configmap` for maintenance jobs). The Velero deployment, node agent daemonset, maintenance jobs, and data mover pods will be updated to use their respective priority class names.
+
+## Detailed Design
+
+### CLI Changes
+
+New flags will be added to the `velero install` command to specify priority class names for different components:
+
+```go
+flags.StringVar(
+    &o.ServerPriorityClassName,
+    "server-priority-class-name",
+    o.ServerPriorityClassName,
+    "Priority class name for the Velero server deployment. Optional.",
+)
+
+flags.StringVar(
+    &o.NodeAgentPriorityClassName,
+    "node-agent-priority-class-name",
+    o.NodeAgentPriorityClassName,
+    "Priority class name for the node agent daemonset. Optional.",
+)
+```
+
+Note: Priority class names for data mover pods and maintenance jobs will be configured through their respective ConfigMaps (`--node-agent-configmap` for data movers and `--repo-maintenance-job-configmap` for maintenance jobs).
+
+### Velero Options Changes
+
+The `VeleroOptions` struct in `pkg/install/resources.go` will be updated to include new fields for priority class names:
+
+```go
+type VeleroOptions struct {
+    // ... existing fields ...
+    ServerPriorityClassName       string
+    NodeAgentPriorityClassName    string
+}
+```
+
+### Deployment Changes
+
+The `podTemplateConfig` struct in `pkg/install/deployment.go` will be updated to include a new field for the priority class name:
+
+```go
+type podTemplateConfig struct {
+    // ... existing fields ...
+    priorityClassName string
+}
+```
+
+A new function, `WithPriorityClassName`, will be added to set this field:
+
+```go
+func WithPriorityClassName(priorityClassName string) podTemplateOption {
+    return func(c *podTemplateConfig) {
+        c.priorityClassName = priorityClassName
+    }
+}
+```
+
+The `Deployment` function will be updated to use the priority class name:
+
+```go
+deployment := &appsv1api.Deployment{
+    // ... existing fields ...
+    Spec: appsv1api.DeploymentSpec{
+        // ... existing fields ...
+        Template: corev1api.PodTemplateSpec{
+            // ... existing fields ...
+            Spec: corev1api.PodSpec{
+                // ... existing fields ...
+                PriorityClassName: c.priorityClassName,
+            },
+        },
+    },
+}
+```
+
+### DaemonSet Changes
+
+The `DaemonSet` function will use the priority class name passed via the podTemplateConfig (from the CLI flag):
+
+```go
+daemonSet := &appsv1api.DaemonSet{
+    // ... existing fields ...
+    Spec: appsv1api.DaemonSetSpec{
+        // ... existing fields ...
+        Template: corev1api.PodTemplateSpec{
+            // ... existing fields ...
+            Spec: corev1api.PodSpec{
+                // ... existing fields ...
+                PriorityClassName: c.priorityClassName,
+            },
+        },
+    },
+}
+```
+
+### Maintenance Job Changes
+
+The `JobConfigs` struct in `pkg/repository/maintenance/maintenance.go` will be updated to include a field for the priority class name:
+
+```go
+type JobConfigs struct {
+    // LoadAffinities is the config for repository maintenance job load affinity.
+    LoadAffinities []*kube.LoadAffinity `json:"loadAffinity,omitempty"`
+
+    // PodResources is the config for the CPU and memory resources setting.
+    PodResources *kube.PodResources `json:"podResources,omitempty"`
+    
+    // PriorityClassName is the priority class name for the maintenance job pod
+    // Note: This is only read from the global configuration, not per-repository
+    PriorityClassName string `json:"priorityClassName,omitempty"`
+}
+```
+
+The `buildJob` function will be updated to use the priority class name from the global job configuration:
+
+```go
+func buildJob(cli client.Client, ctx context.Context, repo *velerov1api.BackupRepository, bslName string, config *JobConfigs,
+    podResources kube.PodResources, logLevel logrus.Level, logFormat *logging.FormatFlag) (*batchv1.Job, error) {
+    // ... existing code ...
+    
+    // Use the priority class name from the global job configuration if available
+    // Note: Priority class is only read from global config, not per-repository
+    priorityClassName := ""
+    if config != nil && config.PriorityClassName != "" {
+        priorityClassName = config.PriorityClassName
+    }
+    
+    // ... existing code ...
+    
+    job := &batchv1.Job{
+        // ... existing fields ...
+        Spec: batchv1.JobSpec{
+            // ... existing fields ...
+            Template: corev1api.PodTemplateSpec{
+                // ... existing fields ...
+                Spec: corev1api.PodSpec{
+                    // ... existing fields ...
+                    PriorityClassName: priorityClassName,
+                },
+            },
+        },
+    }
+    
+    // ... existing code ...
+}
+```
+
+Users will be able to configure the priority class name for all maintenance jobs by creating the repository maintenance job ConfigMap before installation. For example:
+
+```bash
+# Create the ConfigMap before running velero install
+cat <<EOF | kubectl create configmap repo-maintenance-job-config -n velero --from-file=config.json=/dev/stdin
+{
+    "global": {
+        "priorityClassName": "low-priority",
+        "podResources": {
+            "cpuRequest": "100m",
+            "memoryRequest": "128Mi"
+        }
+    }
+}
+EOF
+
+# Then install Velero referencing this ConfigMap
+velero install --provider aws \
+    --repo-maintenance-job-configmap repo-maintenance-job-config \
+    # ... other flags
+```
+
+The ConfigMap can be updated after installation to change the priority class for future maintenance jobs. Note that only the "global" configuration is used for priority class - all maintenance jobs will use the same priority class regardless of which repository they are maintaining.
+
+### Node Agent ConfigMap Changes
+
+We'll update the `Configs` struct in `pkg/nodeagent/node_agent.go` to include a field for the priority class name in the node-agent-configmap:
+
+```go
+type Configs struct {
+    // ... existing fields ...
+    
+    // PriorityClassName is the priority class name for the data mover pods 
+    // created by the node agent
+    PriorityClassName string `json:"priorityClassName,omitempty"`
+}
+```
+
+This will allow users to configure the priority class name for data mover pods through the node-agent-configmap. Note that the node agent daemonset itself gets its priority class from the `--node-agent-priority-class-name` CLI flag during installation, not from this configmap. For example:
+
+```bash
+# Create the ConfigMap before running velero install
+cat <<EOF | kubectl create configmap node-agent-config -n velero --from-file=config.json=/dev/stdin
+{
+    "priorityClassName": "low-priority",
+    "loadAffinity": [
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "node-role.kubernetes.io/worker": "true"
+                }
+            }
+        }
+    ]
+}
+EOF
+
+# Then install Velero referencing this ConfigMap
+velero install --provider aws \
+    --node-agent-configmap node-agent-config \
+    --use-node-agent \
+    # ... other flags
+```
+
+The `createBackupPod` function in `pkg/exposer/csi_snapshot.go` will be updated to accept and use the priority class name:
+
+```go
+func (e *csiSnapshotExposer) createBackupPod(
+    ctx context.Context,
+    ownerObject corev1api.ObjectReference,
+    backupPVC *corev1api.PersistentVolumeClaim,
+    operationTimeout time.Duration,
+    label map[string]string,
+    annotation map[string]string,
+    affinity *kube.LoadAffinity,
+    resources corev1api.ResourceRequirements,
+    backupPVCReadOnly bool,
+    spcNoRelabeling bool,
+    nodeOS string,
+    priorityClassName string, // New parameter
+) (*corev1api.Pod, error) {
+    // ... existing code ...
+    
+    pod := &corev1api.Pod{
+        // ... existing fields ...
+        Spec: corev1api.PodSpec{
+            // ... existing fields ...
+            PriorityClassName: priorityClassName,
+            // ... existing fields ...
+        },
+    }
+    
+    // ... existing code ...
+}
+```
+
+The call to `createBackupPod` in the `Expose` method will be updated to pass the priority class name retrieved from the node-agent-configmap:
+
+```go
+priorityClassName, _ := kube.GetDataMoverPriorityClassName(ctx, namespace, kubeClient, configMapName)
+backupPod, err := e.createBackupPod(
+    ctx,
+    ownerObject,
+    backupPVC,
+    csiExposeParam.OperationTimeout,
+    csiExposeParam.HostingPodLabels,
+    csiExposeParam.HostingPodAnnotations,
+    csiExposeParam.Affinity,
+    csiExposeParam.Resources,
+    backupPVCReadOnly,
+    spcNoRelabeling,
+    csiExposeParam.NodeOS,
+    priorityClassName, // Priority class name from node-agent-configmap
+)
+```
+
+A new function, `GetDataMoverPriorityClassName`, will be added to the `pkg/util/kube` package (in the same file as `ValidatePriorityClass`) to retrieve the priority class name for data mover pods:
+
+```go
+// In pkg/util/kube/priority_class.go
+
+// GetDataMoverPriorityClassName retrieves the priority class name for data mover pods from the node-agent-configmap
+func GetDataMoverPriorityClassName(ctx context.Context, namespace string, kubeClient kubernetes.Interface, configName string) (string, error) {
+    // configData is a minimal struct to parse only the priority class name from the ConfigMap
+    type configData struct {
+        PriorityClassName string `json:"priorityClassName,omitempty"`
+    }
+
+    // Get the ConfigMap
+    cm, err := kubeClient.CoreV1().ConfigMaps(namespace).Get(ctx, configName, metav1.GetOptions{})
+    if err != nil {
+        if apierrors.IsNotFound(err) {
+            // ConfigMap not found is not an error, just return empty string
+            return "", nil
+        }
+        return "", errors.Wrapf(err, "error getting node agent config map %s", configName)
+    }
+
+    if cm.Data == nil {
+        // No data in ConfigMap, return empty string
+        return "", nil
+    }
+
+    // Extract the first value from the ConfigMap data
+    jsonString := ""
+    for _, v := range cm.Data {
+        jsonString = v
+        break // Use the first value found
+    }
+
+    if jsonString == "" {
+        // No data to parse, return empty string
+        return "", nil
+    }
+
+    // Parse the JSON to extract priority class name
+    var config configData
+    if err := json.Unmarshal([]byte(jsonString), &config); err != nil {
+        // Invalid JSON is not a critical error for priority class
+        // Just return empty string to use default behavior
+        return "", nil
+    }
+
+    return config.PriorityClassName, nil
+}
+```
+
+This function will get the priority class name from the node-agent-configmap. If it's not found, it will return an empty string.
+
+### Validation and Logging
+
+To improve observability and help with troubleshooting, the implementation will include:
+
+1. **Optional Priority Class Validation**: A helper function to check if a priority class exists in the cluster. This function will be added to the `pkg/util/kube` package alongside other Kubernetes utility functions:
+
+```go
+// In pkg/util/kube/priority_class.go
+
+// ValidatePriorityClass checks if the specified priority class exists in the cluster
+// Returns true if the priority class exists or if priorityClassName is empty
+// Returns false if the priority class doesn't exist or validation fails
+// Logs warnings when the priority class doesn't exist
+func ValidatePriorityClass(ctx context.Context, kubeClient kubernetes.Interface, priorityClassName string, logger logrus.FieldLogger) bool {
+    if priorityClassName == "" {
+        return true
+    }
+    
+    _, err := kubeClient.SchedulingV1().PriorityClasses().Get(ctx, priorityClassName, metav1.GetOptions{})
+    if err != nil {
+        if apierrors.IsNotFound(err) {
+            logger.Warnf("Priority class %q not found in cluster. Pod creation may fail if the priority class doesn't exist when pods are scheduled.", priorityClassName)
+        } else {
+            logger.WithError(err).Warnf("Failed to validate priority class %q", priorityClassName)
+        }
+        return false
+    }
+    logger.Infof("Validated priority class %q exists in cluster", priorityClassName)
+    return true
+}
+```
+
+2. **Debug Logging**: Add debug logs when priority classes are applied:
+
+```go
+// In deployment creation
+if c.priorityClassName != "" {
+    logger.Debugf("Setting priority class %q for Velero server deployment", c.priorityClassName)
+}
+
+// In daemonset creation
+if c.priorityClassName != "" {
+    logger.Debugf("Setting priority class %q for node agent daemonset", c.priorityClassName)
+}
+
+// In maintenance job creation
+if priorityClassName != "" {
+    logger.Debugf("Setting priority class %q for maintenance job %s", priorityClassName, job.Name)
+}
+
+// In data mover pod creation
+if priorityClassName != "" {
+    logger.Debugf("Setting priority class %q for data mover pod %s", priorityClassName, pod.Name)
+}
+```
+
+These validation and logging features will help administrators:
+
+- Identify configuration issues early (validation warnings)
+- Troubleshoot priority class application issues
+- Verify that priority classes are being applied as expected
+
+The `ValidatePriorityClass` function should be called at the following points:
+
+1. **During `velero install`**: Validate the priority classes specified via CLI flags:
+   - After parsing `--server-priority-class-name` flag
+   - After parsing `--node-agent-priority-class-name` flag
+
+2. **When reading from ConfigMaps**: Validate priority classes when loading configurations:
+   - In `GetDataMoverPriorityClassName` when reading from node-agent-configmap
+   - In maintenance job controller when reading from repo-maintenance-job-configmap
+
+3. **During pod/job creation** (optional, for runtime validation):
+   - Before creating data mover pods (PVB/PVR/CSI snapshot data movement)
+   - Before creating maintenance jobs
+
+Example usage:
+
+```go
+// During velero install
+if o.ServerPriorityClassName != "" {
+    _ = kube.ValidatePriorityClass(ctx, kubeClient, o.ServerPriorityClassName, logger.WithField("component", "server"))
+    // For install command, we continue even if validation fails (warnings are logged)
+}
+
+// When reading from ConfigMap in node-agent server
+priorityClassName, err := kube.GetDataMoverPriorityClassName(ctx, namespace, kubeClient, configMapName)
+if err == nil && priorityClassName != "" {
+    // Validate the priority class exists in the cluster
+    if kube.ValidatePriorityClass(ctx, kubeClient, priorityClassName, logger.WithField("component", "data-mover")) {
+        dataMovePriorityClass = priorityClassName
+        logger.WithField("priorityClassName", priorityClassName).Info("Using priority class for data mover pods")
+    } else {
+        logger.WithField("priorityClassName", priorityClassName).Warn("Priority class not found in cluster, data mover pods will use default priority")
+        // Clear the priority class to prevent pod creation failures
+        priorityClassName = ""
+    }
+}
+```
+
+Note: The validation function returns a boolean to allow callers to decide how to handle missing priority classes. For the install command, validation failures are ignored (only warnings are logged) to allow for scenarios where priority classes might be created after Velero installation. For runtime components like the node-agent server, the priority class is cleared if validation fails to prevent pod creation failures.
+
+## Alternatives Considered
+
+1. **Using a single flag for all components**: We could have used a single flag for all components, but this would not allow for different priority classes for different components. Since maintenance jobs and data movers typically require lower priority than the Velero server, separate flags provide more flexibility.
+
+2. **Using a configuration file**: We could have added support for specifying the priority class names in a configuration file. However, this would have required additional changes to the Velero CLI and would have been more complex to implement.
+
+3. **Inheriting priority class from parent components**: We initially considered having maintenance jobs inherit their priority class from the Velero server, and data movers inherit from the node agent. However, this approach doesn't allow for the appropriate prioritization of different components based on their importance and resource requirements.
+
+## Security Considerations
+
+There are no security considerations for this feature.
+
+## Compatibility
+
+This feature is compatible with all Kubernetes versions that support priority classes. The PodPriority feature became stable in Kubernetes 1.14. For more information, see the [Kubernetes documentation on Pod Priority and Preemption](https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/).
+
+## ConfigMap Update Strategy
+
+### Static ConfigMap Reading at Startup
+
+The node-agent server reads and parses the ConfigMap once during initialization and passes configurations (like `podResources`, `loadAffinity`, and `priorityClassName`) directly to controllers as parameters. This approach ensures:
+
+- Single ConfigMap read to minimize API calls
+- Consistent configuration across all controllers
+- Validation of priority classes at startup with fallback behavior
+- No need for complex update mechanisms or watchers
+
+ConfigMap changes require a restart of the node-agent to take effect.
+
+### Implementation Approach
+
+1. **Data Mover Controllers**: Receive priority class as a string parameter from node-agent server at initialization
+2. **Maintenance Job Controller**: Read fresh configuration from repo-maintenance-job-configmap at job creation time
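+3. Changes to the node-agent-configmap require a restart of the node-agent to take effect, while changes to the repo-maintenance-job-configmap are picked up by maintenance jobs created afterwards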
+4. Priority class validation happens at startup with automatic fallback to prevent failures
+
+## Implementation
+
+The implementation will involve the following steps:
+
+1. Add the priority class name fields for server and node agent to the `VeleroOptions` struct
+2. Add the priority class name field to the `podTemplateConfig` struct
+3. Add the `WithPriorityClassName` function for the server deployment and daemonset
+4. Update the `Deployment` function to use the server priority class name
+5. Update the `DaemonSet` function to use the node agent priority class name
+6. Update the `JobConfigs` struct to include `PriorityClassName` field
+7. Update the `buildJob` function in maintenance job to use the priority class name from JobConfigs (global config only)
+8. Update the `Configs` struct in node agent to include `PriorityClassName` field for data mover pods
+9. Update the data mover pod creation to use the priority class name from node-agent-configmap
+10. Update the PodVolumeBackup controller to retrieve and apply priority class name from node-agent-configmap
+11. Update the PodVolumeRestore controller to retrieve and apply priority class name from node-agent-configmap
+12. Add the `GetDataMoverPriorityClassName` utility function to retrieve priority class from configmap
+13. Add the priority class name flags for server and node agent to the `velero install` command
+14. Add unit tests for:
+    - `WithPriorityClassName` function
+    - `GetDataMoverPriorityClassName` function
+    - Priority class application in deployment, daemonset, and job specs
+15. Add integration tests to verify:
+    - Priority class is correctly applied to all component pods
+    - ConfigMap updates are reflected in new pods
+    - Empty/missing priority class names are handled gracefully
+16. Update user documentation to include:
+    - How to configure priority classes for each component
+    - Examples of creating ConfigMaps before installation
+    - Expected priority class hierarchy recommendations
+    - Troubleshooting guide for priority class issues
+17. Update CLI documentation for new flags (`--server-priority-class-name` and `--node-agent-priority-class-name`)
+
+Note: The server deployment and node agent daemonset will have CLI flags for priority class. Data mover pods and maintenance jobs will use their respective ConfigMaps for priority class configuration.
+
+This approach ensures that different Velero components can use different priority class names based on their importance and resource requirements:
+
+1. The Velero server deployment can use a higher priority class to ensure it continues running even under resource pressure.
+2. The node agent daemonset can use a medium priority class.
+3. Maintenance jobs can use a lower priority class since they should not run when resources are limited.
+4. Data mover pods can use a lower priority class since they should not run when resources are limited.
+
+### Implementation Considerations
+
+Priority class names are configured through different mechanisms:
+
+1. **Server Deployment**: Uses the `--server-priority-class-name` CLI flag during installation.
+
+2. **Node Agent DaemonSet**: Uses the `--node-agent-priority-class-name` CLI flag during installation.
+
+3. **Data Mover Pods**: Will use the node-agent-configmap (specified via the `--node-agent-configmap` flag). This ConfigMap controls priority class for all data mover pods (including PVB and PVR) created by the node agent.
+
+4. **Maintenance Jobs**: Will use the repository maintenance job ConfigMap (specified via the `--repo-maintenance-job-configmap` flag). Users should create this ConfigMap before running `velero install` with the desired priority class configuration. The ConfigMap can be updated after installation to change priority classes for future maintenance jobs. While the ConfigMap structure supports per-repository configuration for resources and affinity, priority class is intentionally only read from the global configuration to ensure all maintenance jobs have the same priority.
+
+#### ConfigMap Pre-Creation Guide
+
+For components that use ConfigMaps for priority class configuration, the ConfigMaps must be created before running `velero install`. Here's the recommended workflow:
+
+```bash
+# Step 1: Create priority classes in your cluster (if not already existing)
+kubectl apply -f - <<EOF
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: velero-critical
+value: 100
+globalDefault: false
+description: "Critical priority for Velero server"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: velero-standard
+value: 50
+globalDefault: false
+description: "Standard priority for Velero node agent"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: velero-low
+value: 10
+globalDefault: false
+description: "Low priority for Velero data movers and maintenance jobs"
+EOF
+
+# Step 2: Create the namespace
+kubectl create namespace velero
+
+# Step 3: Create ConfigMaps for data movers and maintenance jobs
+kubectl create configmap node-agent-config -n velero --from-file=config.json=/dev/stdin <<EOF
+{
+    "priorityClassName": "velero-low"
+}
+EOF
+
+kubectl create configmap repo-maintenance-job-config -n velero --from-file=config.json=/dev/stdin <<EOF
+{
+    "global": {
+        "priorityClassName": "velero-low"
+    }
+}
+EOF
+
+# Step 4: Install Velero with priority class configuration
+velero install \
+    --provider aws \
+    --server-priority-class-name velero-critical \
+    --node-agent-priority-class-name velero-standard \
+    --node-agent-configmap node-agent-config \
+    --repo-maintenance-job-configmap repo-maintenance-job-config \
+    --use-node-agent
+```
+
+#### Recommended Priority Class Hierarchy
+
+When configuring priority classes for Velero components, consider the following hierarchy based on component criticality:
+
+1. **Velero Server (Highest Priority)**:
+   - Example: `velero-critical` with value 100
+   - Rationale: The server must remain running to coordinate backup/restore operations
+
+2. **Node Agent DaemonSet (Medium Priority)**:
+   - Example: `velero-standard` with value 50
+   - Rationale: Node agents need to be available on nodes but are less critical than the server
+
+3. **Data Mover Pods & Maintenance Jobs (Lower Priority)**:
+   - Example: `velero-low` with value 10
+   - Rationale: These are temporary workloads that can be delayed during resource contention
+
+This hierarchy ensures that core Velero components remain operational even under resource pressure, while allowing less critical workloads to be preempted if necessary.
+
+This approach has several advantages:
+
+- Leverages existing configuration mechanisms, minimizing new CLI flags
+- Provides a single point of configuration for related components (node agent and its pods)
+- Allows dynamic configuration updates without requiring Velero reinstallation
+- Maintains backward compatibility with existing installations
+- Enables administrators to set up priority classes during initial deployment
+- Keeps configuration simple by using the same priority class for all maintenance jobs
+
+The priority class name for data mover pods will be determined by checking the node-agent-configmap. This approach provides a centralized way to configure priority class names for all data mover pods. The same approach will be used for PVB (PodVolumeBackup) and PVR (PodVolumeRestore) pods, which will also retrieve their priority class name from the node-agent-configmap.
+
+For PVB and PVR pods specifically, the implementation follows this approach:
+
+1. **Controller Initialization**: Both PodVolumeBackup and PodVolumeRestore controllers are updated to accept a priority class name as a string parameter. The node-agent server reads the priority class from the node-agent-configmap once at startup:
+
+```go
+// In node-agent server startup (pkg/cmd/cli/nodeagent/server.go)
+dataMovePriorityClass := ""
+if s.config.nodeAgentConfig != "" {
+    ctx, cancel := context.WithTimeout(context.Background(), time.Second*30)
+    defer cancel()
+    priorityClass, err := kube.GetDataMoverPriorityClassName(ctx, s.namespace, s.kubeClient, s.config.nodeAgentConfig)
+    if err != nil {
+        s.logger.WithError(err).Warn("Failed to get priority class name from node-agent-configmap, using empty value")
+    } else if priorityClass != "" {
+        // Validate the priority class exists in the cluster
+        if kube.ValidatePriorityClass(ctx, s.kubeClient, priorityClass, s.logger.WithField("component", "data-mover")) {
+            dataMovePriorityClass = priorityClass
+            s.logger.WithField("priorityClassName", priorityClass).Info("Using priority class for data mover pods")
+        } else {
+            s.logger.WithField("priorityClassName", priorityClass).Warn("Priority class not found in cluster, data mover pods will use default priority")
+        }
+    }
+}
+
+// Pass priority class to controllers
+pvbReconciler := controller.NewPodVolumeBackupReconciler(
+    s.mgr.GetClient(), s.mgr, s.kubeClient, ..., dataMovePriorityClass)
+pvrReconciler := controller.NewPodVolumeRestoreReconciler(
+    s.mgr.GetClient(), s.mgr, s.kubeClient, ..., dataMovePriorityClass)
+```
+
+2. **Controller Structure**: Controllers store the priority class name as a field:
+
+```go
+type PodVolumeBackupReconciler struct {
+    // ... existing fields ...
+    dataMovePriorityClass string
+}
+```
+
+3. **Pod Creation**: The priority class is included in the pod spec when creating data mover pods.
+
+### VGDP Micro-Service Considerations
+
+With the introduction of VGDP micro-services (as described in the VGDP micro-service design), data mover pods are created as dedicated pods for volume snapshot data movement. These pods will also inherit the priority class configuration from the node-agent-configmap. Since VGDP-MS pods (backupPod/restorePod) inherit their configurations from the node-agent, they will automatically use the priority class name specified in the node-agent-configmap.
+
+This ensures that all pods created by Velero for data movement operations (CSI snapshot data movement, PVB, and PVR) use a consistent approach for priority class name configuration through the node-agent-configmap.
+
+### How Exposers Receive Configuration
+
+CSI Snapshot Exposer and Generic Restore Exposer do not directly watch or read ConfigMaps. Instead, they receive configuration through their parent controllers:
+
+1. **Controller Initialization**: Controllers receive the priority class name as a parameter during initialization from the node-agent server.
+
+2. **Configuration Propagation**: During reconciliation of resources:
+   - The controller calls `setupExposeParam()` which includes the `dataMovePriorityClass` value
+   - For CSI operations: `CSISnapshotExposeParam.PriorityClassName` is set
+   - For generic restore: `GenericRestoreExposeParam.PriorityClassName` is set
+   - The controller passes these parameters to the exposer's `Expose()` method
+
+3. **Pod Creation**: The exposer creates pods with the priority class name provided by the controller.
+
+This design keeps exposers stateless and ensures:
+- Exposers remain simple and focused on pod creation
+- All configuration flows through controllers consistently
+- No complex state synchronization between components
+- Configuration changes require component restart to take effect
+
+## Open Issues
+
+None.
--- a/design/Implemented/repo_maintenance_job_config.md
+++ b/design/Implemented/repo_maintenance_job_config.md
@@ -4,7 +4,7 @@
 Add this design to make the repository maintenance job can read configuration from a dedicate ConfigMap and make the Job's necessary parts configurable, e.g. `PodSpec.Affinity` and `PodSpec.Resources`.

 ## Background
-Repository maintenance is split from the Velero server to a k8s Job in v1.14 by design [repository maintenance job](Implemented/repository-maintenance.md).
+Repository maintenance is split from the Velero server to a k8s Job in v1.14 by design [repository maintenance job](repository-maintenance.md).
 The repository maintenance Job configuration was read from the Velero server CLI parameter, and it inherits the most of Velero server's Deployment's PodSpec to fill un-configured fields.

 This design introduces a new way to let the user to customize the repository maintenance behavior instead of inheriting from the Velero server Deployment or reading from `velero server` CLI parameters.
@@ -13,7 +13,7 @@ It's possible new configurations are introduced in future releases based on this

 For the node selection, the repository maintenance Job also inherits from the Velero server deployment before, but the Job may last for a while and cost noneligible resources, especially memory.
 The users have the need to choose which k8s node to run the maintenance Job.
-This design reuses the data structure introduced by design [node-agent affinity configuration](Implemented/node-agent-affinity.md) to make the repository maintenance job can choose which node running on.
+This design reuses the data structure introduced by design [Velero Generic Data Path affinity configuration](node-agent-affinity.md) to make the repository maintenance job can choose which node running on.

 ## Goals
 - Unify the repository maintenance Job configuration at one place.
@@ -118,7 +118,7 @@ For example, the following BackupRepository's key should be `test-default-kopia`
    volumeNamespace: test
 ```

-The `LoadAffinity` structure is reused from design [node-agent affinity configuration](Implemented/node-agent-affinity.md).
+The `LoadAffinity` structure is reused from design [Velero Generic Data Path affinity configuration](node-agent-affinity.md).
 It's possible that the users want to choose nodes that match condition A or condition B to run the job.
 For example, the user want to let the nodes is in a specified machine type or the nodes locate in the us-central1-x zones to run the job.
 This can be done by adding multiple entries in the `LoadAffinity` array.
--- a/design/Implemented/vgdp-affinity-enhancement.md
+++ b/design/Implemented/vgdp-affinity-enhancement.md
@@ -0,0 +1,257 @@
+# Velero Generic Data Path Load Affinity Enhancement Design
+
+## Glossary & Abbreviation
+
+**Velero Generic Data Path (VGDP)**: VGDP is the collection of modules introduced in the [Unified Repository design][1]. Velero uses these modules to transfer data for various purposes (e.g., PodVolume backup/restore, Volume Snapshot Data Movement). VGDP modules include uploaders and the backup repository.
+
+**Exposer**: Exposer is a module introduced in the [Volume Snapshot Data Movement Design][2]. Velero uses this module to expose volume snapshots to Velero node-agent pods or node-agent associated pods so as to complete the data movement from the snapshots.
+
+## Background
+
+The implemented [VGDP LoadAffinity design][3] already defined a structure `LoadAffinity` in the `--node-agent-configmap` parameter. The parameter is used to set the affinity of the backupPod of VGDP.
+
+There are still some limitations of this design:
+* The affinity setting is global. Say there are two StorageClasses and the underlying storage can only provision volumes to part of the cluster nodes. The supported nodes don't have intersection. Then the affinity will definitely not work in some cases.
+* The old design focuses on the backupPod affinity, but the restorePod also needs the affinity setting.
+
+This design is therefore created to address these limitations.
+
+## Goals
+
+- Enhance the node affinity of VGDP instances for volume snapshot data movement: add per StorageClass node affinity.
+- Enhance the node affinity of VGDP instances for volume snapshot data movement: support the or logic between affinity selectors.
+- Define the behaviors of node affinity of VGDP instances in node-agent for volume snapshot data movement restore, when the PVC restore doesn't require delay binding.
+
+## Non-Goals
+
+- It is also beneficial to support VGDP instances affinity for PodVolume backup/restore, this will be implemented after the PodVolume micro service completes.
+
+## Solution
+
+This design still uses the ConfigMap specified by `velero node-agent` CLI's parameter `--node-agent-configmap` to host the node affinity configurations.
+
+Building on the `[]*LoadAffinity` structure introduced by the implemented [VGDP LoadAffinity design][3], this design adds a new field `StorageClass`. This field is optional.
+* If the `LoadAffinity` element's `StorageClass` doesn't have a value, the element applies globally, just as in the old design.
+* If the `LoadAffinity` element's `StorageClass` has a value, the element applies only to the VGDP instances whose PVCs use the specified StorageClass.
+* The `LoadAffinity` element whose `StorageClass` has value has higher priority than the `LoadAffinity` element whose `StorageClass` doesn't have value.
+
+
+```go
+type Configs struct {
+    // LoadConcurrency is the config for load concurrency per node.
+    LoadConcurrency *LoadConcurrency `json:"loadConcurrency,omitempty"`
+
+    // LoadAffinity is the config for data path load affinity.
+    LoadAffinity []*LoadAffinity `json:"loadAffinity,omitempty"`    
+}
+
+type LoadAffinity struct {
+    // NodeSelector specifies the label selector to match nodes
+    NodeSelector metav1.LabelSelector `json:"nodeSelector"`
+}
+```
+
+``` go
+type LoadAffinity struct {
+    // NodeSelector specifies the label selector to match nodes
+    NodeSelector metav1.LabelSelector `json:"nodeSelector"`
+
+    // StorageClass specifies the VGDPs the LoadAffinity applied to. If the StorageClass doesn't have value, it applies to all. If not, it applies to only the VGDPs that use this StorageClass.
+    StorageClass string `json:"storageClass"`
+}
+```
+
+### Decision Tree
+
+```mermaid
+flowchart TD
+    A[VGDP Pod Needs Scheduling] --> B{Is this a restore operation?}
+    
+    B -->|Yes| C{StorageClass has volumeBindingMode: WaitForFirstConsumer?}
+    B -->|No| D[Backup Operation]
+    
+    C -->|Yes| E{restorePVC.ignoreDelayBinding = true?}
+    C -->|No| F[StorageClass binding mode: Immediate]
+    
+    E -->|No| G[Wait for target Pod scheduling<br/>Use Pod's selected node<br/>⚠️ Affinity rules ignored]
+    E -->|Yes| H[Apply affinity rules<br/>despite WaitForFirstConsumer]
+    
+    F --> I{Check StorageClass in loadAffinity by StorageClass field}
+    H --> I
+    D --> J{Using backupPVC with different StorageClass?}
+    
+    J -->|Yes| K[Use final StorageClass<br/>for affinity lookup]
+    J -->|No| L[Use original PVC StorageClass<br/>for affinity lookup]
+    
+    K --> I
+    L --> I
+    
+    I -->|StorageClass found| N[Filter the LoadAffinity by <br/>the StorageClass<br/>🎯 and apply the LoadAffinity HIGHEST PRIORITY]
+    I -->|StorageClass not found| O{Check loadAffinity element without StorageClass field}
+
+    O -->|No loadAffinity configured| R[No affinity constraints<br/>Schedule on any available node<br/>🌐 DEFAULT]
+    
+    O --> V[Validate node-agent availability<br/>⚠️ Ensure node-agent pods exist on target nodes]
+    N --> V
+    
+    V --> W{Node-agent available on selected nodes?}
+    W -->|Yes| X[✅ VGDP Pod scheduled successfully]
+    W -->|No| Y[❌ Pod stays in Pending state<br/>Timeout after 30min<br/>Check node-agent DaemonSet coverage]
+    
+    R --> Z[Schedule on any node<br/>✅ Basic scheduling]
+    
+    %% Styling
+    classDef successNode fill:#d4edda,stroke:#155724,color:#155724
+    classDef warningNode fill:#fff3cd,stroke:#856404,color:#856404
+    classDef errorNode fill:#f8d7da,stroke:#721c24,color:#721c24
+    classDef priorityHigh fill:#e7f3ff,stroke:#0066cc,color:#0066cc
+    classDef priorityMedium fill:#f0f8ff,stroke:#4d94ff,color:#4d94ff
+    classDef priorityDefault fill:#f8f9fa,stroke:#6c757d,color:#6c757d
+    
+    class X,Z successNode
+    class G,V,Y warningNode
+    class Y errorNode
+    class N,T,U priorityHigh
+    class P,Q priorityMedium
+    class R priorityDefault
+```
+
+### Examples
+
+#### LoadAffinity interacts with LoadAffinityPerStorageClass
+
+``` json
+{
+    "loadAffinity": [
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "beta.kubernetes.io/instance-type": "Standard_B4ms"
+                }
+            }
+        },
+        {
+            "nodeSelector": {
+                "matchExpressions": [
+                    {
+                        "key": "kubernetes.io/os",
+                        "values": [
+                            "linux"
+                        ],
+                        "operator": "In"
+                    }
+                ]
+            },
+            "storageClass": "kibishii-storage-class"
+        },
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "beta.kubernetes.io/instance-type": "Standard_B8ms"
+                }
+            },
+            "storageClass": "kibishii-storage-class"
+        }
+    ]
+}
+```
+
+This sample demonstrates how the `loadAffinity` elements with and without the `StorageClass` field work together.
+If the volume mounted by a VGDP instance is created from StorageClass `kibishii-storage-class`, its pod will run on Linux nodes or on nodes whose instance type is `Standard_B8ms`.
+
+The other VGDP instances will run on nodes whose instance type is `Standard_B4ms`.
+
+#### LoadAffinity interacts with BackupPVC
+
+``` json
+{
+    "loadAffinity": [
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "beta.kubernetes.io/instance-type": "Standard_B4ms"
+                }
+            },
+            "storageClass": "kibishii-storage-class"
+        },
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "beta.kubernetes.io/instance-type": "Standard_B2ms"
+                }
+            },
+            "storageClass": "worker-storagepolicy"
+        }
+    ],
+    "backupPVC": {
+        "kibishii-storage-class": {
+            "storageClass": "worker-storagepolicy"
+        }
+    }
+}
+```
+
+Velero data mover supports using a different StorageClass to create the backupPVC, as described in this [design](https://github.com/vmware-tanzu/velero/pull/7982).
+
+In this example, if the backup target PVC's StorageClass is `kibishii-storage-class`, its backupPVC should use StorageClass `worker-storagepolicy`. Because the final StorageClass is `worker-storagepolicy`, the backupPod uses the `loadAffinity` elements whose `StorageClass` field is set to `worker-storagepolicy`. The backupPod will be assigned to nodes whose instance type is `Standard_B2ms`.
+
+
+#### LoadAffinity interacts with RestorePVC
+
+``` json
+{
+    "loadAffinity": [
+        {
+            "nodeSelector": {
+                "matchLabels": {
+                    "beta.kubernetes.io/instance-type": "Standard_B4ms"
+                }
+            },
+            "storageClass": "kibishii-storage-class"
+        }
+    ],
+    "restorePVC": {
+        "ignoreDelayBinding": false
+    }
+}
+```
+
+##### StorageClass's bind mode is WaitForFirstConsumer
+
+``` yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: kibishii-storage-class
+parameters:
+  svStorageClass: worker-storagepolicy
+provisioner: csi.vsphere.vmware.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+```
+
+Suppose the restorePVC is created from StorageClass `kibishii-storage-class`, and its volumeBindingMode is `WaitForFirstConsumer`.
+Although `loadAffinity` has an element that matches the StorageClass, `ignoreDelayBinding` is set to `false`, so the Velero exposer will wait until the target Pod is scheduled to a node, and return that node as the SelectedNode for the restorePVC.
+As a result, the matching `loadAffinity` element will not take effect.
+
+##### StorageClass's bind mode is Immediate
+
+``` yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: kibishii-storage-class
+parameters:
+  svStorageClass: worker-storagepolicy
+provisioner: csi.vsphere.vmware.com
+reclaimPolicy: Delete
+volumeBindingMode: Immediate
+```
+
+Because the StorageClass's volumeBindingMode is `Immediate`, the restorePVC will not be created according to the target Pod, even though `ignoreDelayBinding` is set to `false`.
+
+The restorePod will be assigned to nodes whose instance type is `Standard_B4ms`.
+
+[1]: unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
+[2]: volume-snapshot-data-movement/volume-snapshot-data-movement.md
+[3]: node-agent-affinity.md
--- a/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md
+++ b/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md
@@ -0,0 +1,662 @@
+# VGDP Micro Service For fs-backup
+
+## Glossary & Abbreviation
+
+**VGDP**: Velero Generic Data Path. The collective modules that are introduced in [Unified Repository design][1]. Velero uses these modules to finish data transmission for various purposes. It includes uploaders and the backup repository.  
+**fs-backup**: Also known as pod volume backup (PVB)/pod volume restore (PVR). It is one of the primary backup methods built-in with Velero. It has been refactored in [Unified Repository design][1].  
+**PVB**: Pod Volume Backup, the internal name for backup part of fs-backup.  
+**PVR**: Pod Volume Restore, the internal name for restore part of fs-backup.  
+**Exposer**: Exposer is introduced in [Volume Snapshot Data Movement design][2] and is used to expose the volume snapshots/volumes for VGDP to access locally.  
+**VGDP MS**: VGDP Micro Service, it is introduced in [VGDP Micro Service For Volume Snapshot Data Movement][3]. It hosts VGDP instances in dedicated backup/restore pods, instead of in node-agent pods.   
+
+## Background
+As described in [VGDP Micro Service For Volume Snapshot Data Movement][3], hosting VGDP instances in dedicated pods has solved many major problems and brought significant improvements in scalability. These improvements are also effective for fs-backup. And besides the benefits listed in [VGDP Micro Service For Volume Snapshot Data Movement][3], we can also see below ones specifically for fs-backup:
+- This enables fs-backup to support Windows workloads. Windows doesn't support propagate mount, so the current fs-backup solution doesn't work for Windows nodes and Windows workloads. However, if the final host-path for the source volume is mounted to the VGDP MS pods, it should work.   
+- This enables fs-backup to reuse the existing VGDP features seamlessly, i.e., concurrency control, node selector, etc.
+
+By moving all VGDP instances out of node-agent pods, we would further get prepared for below important features and improvements:
+- NFS support: NFS volumes are mounted to VGDP MS pods, so node-agent pods don't need to restart when a new BSL is added.
+- Performance improvement for Kopia uploader restore ([#7725][9]): dedicated cache volumes could be mounted to the VGDP MS pods, without affecting node-agent pods.
+- Controllable resource usage for node-agent: node-agent pods are long running and so not suitable for data path activities, as the OS usually reclaims memory lazily, so the unused memory may be shown as occupied by node-agent pods, which misleads Kubernetes or other related subsystems. After this change, node-agent pods no longer require large resource (CPU/memory) usage, so no obvious memory retention will be observed.
+- Simplify node-agent configuration: host-path mounts, root user and privileged mode are no longer required by node-agent; and the configuration differences of node-agent for linux and Windows nodes could be eliminated. 
+
+## Goals
+- Create a solution to make VGDP instances as micro services for fs-backup
+- Modify the fs-backup workflow to offload the VGDP work from node-agent to the VGDP MS
+- Create the mechanism for fs-backup to control and monitor the VGDP MS in various scenarios
+
+## Non-Goals
+- The current solution covers the VGDP Micro Service for fs-backup itself; the potential features/improvements that rely on this solution will be covered by further designs and implementations.   
+
+
+## Overview
+The solution is based on [VGDP Micro Service For Volume Snapshot Data Movement][3], the architecture is followed as is and existing components are not changed unless it is necessary.  
+Below diagram shows how these components work together:  
+![vgdp-ms-1.png](vgdp-ms-1.png)  
+
+Below lists the changed components, why and how:  
+**Pod-Volume Exposer**: A new exposer, pod-volume exposer is added. It retrieves the host path of the specific volume and then creates the backupPod/restorePod and mounts the host path to the pod. The command of the backupPod/restorePod is also changed to launch VGDP MS for PVB/PVR.  
+**PVB/PVR Controller**: The PVB/PVR controllers are refactored to work with podVolume exposer, VGDP-MS, etc. The controllers will also support Cancel and resume. So PVB/PVR CRD is also refactored to support these scenarios.    
+**PVB/PVR VGDP-MS**: New commands for PVB/PVR VGDP-MS are added. The VGDP instances are started in the backupPod/restorePod as result of the commands.  
+
+The VGDP Watcher and its mechanism are fully reused.
+
+The [Node-agent concurrency][4] is reused to control the concurrency of VGDP MS for fs-backup. When there are too many volumes in the backup/restore, which takes too much computing resources(CPU, memory, etc.) or Kubernetes resources(pods, PVCs, PVs, etc.), users could set the concurrency in each node so as to control the total number of concurrent VGDP instances in the cluster.  
+
+## Detailed Design
+### Exposer
+As the old behavior, the host path (e.g., `/var/lib/kubelet/pods`) for the Kubernetes pods are mounted to node-agent pods, then the VGDP instances running in the same pods access the data through subdir of the host path for a specific volume, e.g.,  `/var/lib/kubelet/pods/<pod UID>/volumes/kubernetes.io~csi/<PVC name>/mount`. Therefore, a node-agent pod could access all volumes attached to the same node.  
+For the new implementation, the exposer retrieves the host path for a specific volume directly, and then mount that host path to the backupPod/restorePod. This also means that the backupPod/restorePod could only access the volume to be backed up or restored.    
+
+The exposer creates backupPod/restorePod and sets ```velero pod-volume``` as the command run by backupPod/restorePod. And `velero` image is used for the backupPod/restorePod.   
+There are sub commands varying from backup and restore:  
+```velero pod-volume backup --volume-path xxx --pod-volume-backup xxx --resource-timeout xxx --log-format xxx --log-level xxx```  
+Or:  
+```velero pod-volume restore --volume-path xxx --pod-volume-restore xxx --resource-timeout xxx --log-format xxx --log-level xxx```  
+
+Below are the parameters of the commands:  
+**volume-path**: Deliver the full path inside the backupPod/restorePod for the volume to be backed up/restored.    
+**pod-volume-backup**: PVB CR for this backup.  
+**pod-volume-restore**: PVR CR for this restore.  
+**resource-timeout**: resource-timeout is used to control the timeout for operations related to resources. It has the same meaning with the resource-timeout for node-agent.  
+**log-format** and **log-level**: This is to control the behavior of log generation inside VGDP-MS.  
+
+Below pod configurations are inherited from node-agent and set to backupPod/restorePod's spec:
+- Volumes: Some configMaps will be mapped as volumes to node-agent, so we add the same volumes of node-agent to the backupPod/restorePod
+- Environment Variables
+- Security Contexts  
+
+Since the volume data is still accessed by host path, the backupPod/restorePod may still need to run in Privileged mode in some environments. Therefore, the Privileged mode setting which is a part of Security Contexts will be inherited from node-agent.    
+The root user is still required, especially by the restore (in order to restore the file system attributes, owners, etc.), so we will use root user for backupPod/restorePod.  
+
+The same as [VGDP Micro Service For Volume Snapshot Data Movement][3], the backupPod/restorePod's ```RestartPolicy``` is set to ```RestartPolicyNever```, so that once VGDP-MS terminates for any reason, backupPod/restorePod won't restart and the PVB/PVR is marked as one of the terminal phases (Completed/Failed/Cancelled) accordingly.  
+
+### VGDP Watcher
+The VGDP watcher is fully reused, specifically, we still use the dual mode event watcher to watch the status change from backupPod/restorePod or the VGDP instance.  
+The AsyncBR adapter and its interface are also fully reused.  
+
+### VGDP-MS
+The VGDP-MS that is represented by ```velero pod-volume``` keeps the same workflow as [VGDP Micro Service For Volume Snapshot Data Movement][3]:  
+![vgdp-ms-2.png](vgdp-ms-2.png)  
+
+**Start DUCR/DDCR Watcher**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3], except that it watches PVB/PVR CRs.  
+**Wait DUCR/DDCR InProgress**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3], VGDP-MS won't start the VGDP instance until PVB/PVR CR turns to ```InProgress```.  
+**Record VGDP Starts**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3].  
+**VGDP Callbacks**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3].  
+**Record VGDP Ends**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3].    
+**Record VGDP Progress**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3].  
+**Set VGDP Output**: The same as [VGDP Micro Service For Volume Snapshot Data Movement][3].  
+
+The return message for VGDP completion is also reused, except that `VolMode` is always set to `PersistentVolumeFilesystem`:  
+```
+type BackupResult struct {
+    SnapshotID    string              `json:"snapshotID"`
+    EmptySnapshot bool                `json:"emptySnapshot"`
+    Source        exposer.AccessPoint `json:"source,omitempty"`
+}
+```
+```
+type RestoreResult struct {
+    Target exposer.AccessPoint `json:"target,omitempty"`
+}
+```  
+``` 
+type AccessPoint struct {
+    ByPath  string                        `json:"byPath"`
+    VolMode uploader.PersistentVolumeMode `json:"volumeMode"`
+}
+``` 
+
+And the mechanism and data struct for Progress update is also reused:
+``` 
+type Progress struct {
+    TotalBytes int64 `json:"totalBytes,omitempty"`
+    BytesDone  int64 `json:"doneBytes,omitempty"`
+}
+```    
+
+### Log Collection
+The log collection mechanism is the same as [VGDP Micro Service For Volume Snapshot Data Movement][3].       
+
+### Resource Control
+The resource control mechanism is the same as [VGDP Micro Service For Volume Snapshot Data Movement][3].  
+
+### Restic Restore
+As the current Restic path deprecation process, restore is still supported. On the other hand, we don't want to support Restic path for this new VGDP MS implementation.  
+Therefore, the legacy PVR controller and workflow are preserved for Restic path restore.  The controller watches legacy PVRs only, and then launches the legacy workflow. Meanwhile, the new PVR controller should skip legacy PVRs.  
+After Restic path is fully deprecated, the code for the legacy controller and workflow should be removed.  
+
+### Velero Server Restarts
+The backup/restore stays in InProgress phase during the running of PVB/PVR, no phase changes between completion of item iteration and completion of PVB/PVR.   As a result, on Velero server restarts, there is no way to resume a backup/restore.  
+Therefore, the backup/restore will be marked as Failed, which is the same as the old behavior. And it is still not as good as CSI snapshot data movement for which the backup/restore could be resumed as long as it has iterated all items.  
+Meanwhile, there are indeed some improvements. As the old behavior, once the backup/restore is set as Failed on Velero server restart, the running PVB/PVR will be left there, as a result, the VGDP instances may run for a long time and take lots of resource for nothing; for the new implementation, PVB/PVR will be set as Cancel immediately after the backup/restore is set as Failed.  
+
+### node-agent Restarts
+As the old behavior, once a node-agent pod restarts, all the PVBs/PVRs running in the same node will be set as Failed as there is no way to resume the VGDP instances for them.  
+For the new implementation, since the VGDP instances run in dedicated backupPods/restorePods without being affected, the PVBs/PVRs will be resumed after node-agent restarts. This includes PVBs/PVRs in all phases.  
+
+The legacy PVRs handling Restic restore are processed by the old workflow, so they will still be set as Failed on node-agent restart.  
+
+### Windows Support
+Windows nodes and workloads will be supported by following the same changes for CSI snapshot data movement as listed in [Velero Windows Support][7]. There are some additional changes particularly for PVB/PVR.   
+
+#### Restore Helper 
+PVR requires an init-container, called `restore-wait`, to run in the workload pod. There are default configurations for the container and users could customize them by the `pod-volume-restore` RIA plugin configMap.
+The `pod-volume-restore` RIA is used to config the init-container, so it should support Windows pods for all the configurations.  
+Meanwhile, the customized options in the configMap should also support Windows pods. If an option is not suitable for Windows pods, it will be ignored by the RIA.  
+
+By default, the init-container uses `velero` image with a binary called `velero-restore-helper` inside, so that binary should be compiled and assembled to the `velero` image for Windows.  
+
+#### Privileged mode
+Privileged pods are implemented by [HostProcess Pods][8] on Windows and need to be specially configured. And there are many constraints for it.  
+As one of the constraints, HostProcess pods support Windows service accounts only. As a result, restore will not be able to support it until [#8423][10] is fixed, otherwise, the restored files are not usable by workloads which run under general container users, e.g., `containerUser` or `containerAdministrator`.  
+Therefore, as the current implementation, fs-backup will not support Windows workloads in the environments where Privileged mode is required. A limitation should be documented.  
+
+## node-agent
+node-agent is required to host the PVB/PVR controller which reconciles PVB/PVR and operates PVB/PVR in other steps before the VGDP instance is started, i.e., Accept, Expose, etc.  
+node-agent still requires host path mount because of two deprecating features [in-tree storage provider support deprecation][5] and [emptyDir volume support deprecation][6]. As a result, Privileged mode and root user are still required in some environments. Therefore, we will keep the node-agent daemonset as is, until the two deprecations complete.   
+
+## CRD Changes
+In order to support the VGDP MS workflow, some elements in the PVB/PVR CRDs are added or extended:
+- New phases are added for PVB/PVR: `PodVolumeBackupPhaseAccepted`, `PodVolumeBackupPhasePrepared`, `PodVolumeBackupPhaseCanceling`, `PodVolumeBackupPhaseCanceled`; `PodVolumeRestorePhaseAccepted`, `PodVolumeRestorePhasePrepared`, `PodVolumeRestorePhaseCanceling`, `PodVolumeRestorePhaseCanceled`.  
+- New fields are added to PVB/PVR spec to support cancel: `Cancel bool`
+- New fields are added to PVB/PVR status to support the accept phase and processing: `AcceptedTimestamp *metav1.Time`
+- A new field, which records the node the PVR is running on, is added to PVR Status: `Node string`
+
+No changes happen to Backup/Restore CRDs.  
+
+Below is the new PVB CRD:
+```yaml
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.5
+  name: podvolumebackups.velero.io
+spec:
+  group: velero.io
+  names:
+    kind: PodVolumeBackup
+    listKind: PodVolumeBackupList
+    plural: podvolumebackups
+    singular: podvolumebackup
+  scope: Namespaced
+  versions:
+  - additionalPrinterColumns:
+    - description: PodVolumeBackup status such as New/InProgress
+      jsonPath: .status.phase
+      name: Status
+      type: string
+    - description: Time duration since this PodVolumeBackup was started
+      jsonPath: .status.startTimestamp
+      name: Started
+      type: date
+    - description: Completed bytes
+      format: int64
+      jsonPath: .status.progress.bytesDone
+      name: Bytes Done
+      type: integer
+    - description: Total bytes
+      format: int64
+      jsonPath: .status.progress.totalBytes
+      name: Total Bytes
+      type: integer
+    - description: Name of the Backup Storage Location where this backup should be
+        stored
+      jsonPath: .spec.backupStorageLocation
+      name: Storage Location
+      type: string
+    - description: Time duration since this PodVolumeBackup was created
+      jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - description: Name of the node where the PodVolumeBackup is processed
+      jsonPath: .status.node
+      name: Node
+      type: string
+    - description: The type of the uploader to handle data transfer
+      jsonPath: .spec.uploaderType
+      name: Uploader
+      type: string
+    name: v1
+    schema:
+      openAPIV3Schema:
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: PodVolumeBackupSpec is the specification for a PodVolumeBackup.
+            properties:
+              backupStorageLocation:
+                description: |-
+                  BackupStorageLocation is the name of the backup storage location
+                  where the backup repository is stored.
+                type: string
+              cancel:
+                description: |-
+                  Cancel indicates request to cancel the ongoing PodVolumeBackup. It can be set
+                  when the PodVolumeBackup is in InProgress phase
+                type: boolean
+              node:
+                description: Node is the name of the node that the Pod is running
+                  on.
+                type: string
+              pod:
+                description: Pod is a reference to the pod containing the volume to
+                  be backed up.
+                properties:
+                  apiVersion:
+                    description: API version of the referent.
+                    type: string
+                  fieldPath:
+                    description: |-
+                      If referring to a piece of an object instead of an entire object, this string
+                      should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
+                      For example, if the object reference is to a container within a pod, this would take on a value like:
+                      "spec.containers{name}" (where "name" refers to the name of the container that triggered
+                      the event) or if no container name is specified "spec.containers[2]" (container with
+                      index 2 in this pod). This syntax is chosen only to have some well-defined way of
+                      referencing a part of an object.
+                    type: string
+                  kind:
+                    description: |-
+                      Kind of the referent.
+                      More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+                    type: string
+                  name:
+                    description: |-
+                      Name of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                    type: string
+                  namespace:
+                    description: |-
+                      Namespace of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
+                    type: string
+                  resourceVersion:
+                    description: |-
+                      Specific resourceVersion to which this reference is made, if any.
+                      More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
+                    type: string
+                  uid:
+                    description: |-
+                      UID of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
+                    type: string
+                type: object
+                x-kubernetes-map-type: atomic
+              repoIdentifier:
+                description: RepoIdentifier is the backup repository identifier.
+                type: string
+              tags:
+                additionalProperties:
+                  type: string
+                description: |-
+                  Tags are a map of key-value pairs that should be applied to the
+                  volume backup as tags.
+                type: object
+              uploaderSettings:
+                additionalProperties:
+                  type: string
+                description: |-
+                  UploaderSettings are a map of key-value pairs that should be applied to the
+                  uploader configuration.
+                nullable: true
+                type: object
+              uploaderType:
+                description: UploaderType is the type of the uploader to handle the
+                  data transfer.
+                enum:
+                - kopia
+                - ""
+                type: string
+              volume:
+                description: |-
+                  Volume is the name of the volume within the Pod to be backed
+                  up.
+                type: string
+            required:
+            - backupStorageLocation
+            - node
+            - pod
+            - repoIdentifier
+            - volume
+            type: object
+          status:
+            description: PodVolumeBackupStatus is the current status of a PodVolumeBackup.
+            properties:
+              acceptedTimestamp:
+                description: |-
+                  AcceptedTimestamp records the time the pod volume backup is to be prepared.
+                  The server's time is used for AcceptedTimestamp
+                format: date-time
+                nullable: true
+                type: string
+              completionTimestamp:
+                description: |-
+                  CompletionTimestamp records the time a backup was completed.
+                  Completion time is recorded even on failed backups.
+                  Completion time is recorded before uploading the backup object.
+                  The server's time is used for CompletionTimestamps
+                format: date-time
+                nullable: true
+                type: string
+              message:
+                description: Message is a message about the pod volume backup's status.
+                type: string
+              path:
+                description: Path is the full path within the controller pod being
+                  backed up.
+                type: string
+              phase:
+                description: Phase is the current state of the PodVolumeBackup.
+                enum:
+                - New
+                - Accepted
+                - Prepared
+                - InProgress
+                - Canceling
+                - Canceled
+                - Completed
+                - Failed
+                type: string
+              progress:
+                description: |-
+                  Progress holds the total number of bytes of the volume and the current
+                  number of backed up bytes. This can be used to display progress information
+                  about the backup operation.
+                properties:
+                  bytesDone:
+                    format: int64
+                    type: integer
+                  totalBytes:
+                    format: int64
+                    type: integer
+                type: object
+              snapshotID:
+                description: SnapshotID is the identifier for the snapshot of the
+                  pod volume.
+                type: string
+              startTimestamp:
+                description: |-
+                  StartTimestamp records the time a backup was started.
+                  Separate from CreationTimestamp, since that value changes
+                  on restores.
+                  The server's time is used for StartTimestamps
+                format: date-time
+                nullable: true
+                type: string
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources: {}
+```
+
+Below is the new PVR CRD:
+```yaml
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.16.5
+  name: podvolumerestores.velero.io
+spec:
+  group: velero.io
+  names:
+    kind: PodVolumeRestore
+    listKind: PodVolumeRestoreList
+    plural: podvolumerestores
+    singular: podvolumerestore
+  scope: Namespaced
+  versions:
+  - additionalPrinterColumns:
+    - description: PodVolumeRestore status such as New/InProgress
+      jsonPath: .status.phase
+      name: Status
+      type: string
+    - description: Time duration since this PodVolumeRestore was started
+      jsonPath: .status.startTimestamp
+      name: Started
+      type: date
+    - description: Completed bytes
+      format: int64
+      jsonPath: .status.progress.bytesDone
+      name: Bytes Done
+      type: integer
+    - description: Total bytes
+      format: int64
+      jsonPath: .status.progress.totalBytes
+      name: Total Bytes
+      type: integer
+    - description: Name of the Backup Storage Location where the backup data is stored
+      jsonPath: .spec.backupStorageLocation
+      name: Storage Location
+      type: string
+    - description: Time duration since this PodVolumeRestore was created
+      jsonPath: .metadata.creationTimestamp
+      name: Age
+      type: date
+    - description: Name of the node where the PodVolumeRestore is processed
+      jsonPath: .status.node
+      name: Node
+      type: string
+    - description: The type of the uploader to handle data transfer
+      jsonPath: .spec.uploaderType
+      name: Uploader Type
+      type: string
+    name: v1
+    schema:
+      openAPIV3Schema:
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          spec:
+            description: PodVolumeRestoreSpec is the specification for a PodVolumeRestore.
+            properties:
+              backupStorageLocation:
+                description: |-
+                  BackupStorageLocation is the name of the backup storage location
+                  where the backup repository is stored.
+                type: string
+              cancel:
+                description: |-
+                  Cancel indicates request to cancel the ongoing PodVolumeRestore. It can be set
+                  when the PodVolumeRestore is in InProgress phase
+                type: boolean
+              pod:
+                description: Pod is a reference to the pod containing the volume to
+                  be restored.
+                properties:
+                  apiVersion:
+                    description: API version of the referent.
+                    type: string
+                  fieldPath:
+                    description: |-
+                      If referring to a piece of an object instead of an entire object, this string
+                      should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2].
+                      For example, if the object reference is to a container within a pod, this would take on a value like:
+                      "spec.containers{name}" (where "name" refers to the name of the container that triggered
+                      the event) or if no container name is specified "spec.containers[2]" (container with
+                      index 2 in this pod). This syntax is chosen only to have some well-defined way of
+                      referencing a part of an object.
+                    type: string
+                  kind:
+                    description: |-
+                      Kind of the referent.
+                      More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+                    type: string
+                  name:
+                    description: |-
+                      Name of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
+                    type: string
+                  namespace:
+                    description: |-
+                      Namespace of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/
+                    type: string
+                  resourceVersion:
+                    description: |-
+                      Specific resourceVersion to which this reference is made, if any.
+                      More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
+                    type: string
+                  uid:
+                    description: |-
+                      UID of the referent.
+                      More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids
+                    type: string
+                type: object
+                x-kubernetes-map-type: atomic
+              repoIdentifier:
+                description: RepoIdentifier is the backup repository identifier.
+                type: string
+              snapshotID:
+                description: SnapshotID is the ID of the volume snapshot to be restored.
+                type: string
+              sourceNamespace:
+                description: SourceNamespace is the original namespace for namespace
+                  mapping.
+                type: string
+              uploaderSettings:
+                additionalProperties:
+                  type: string
+                description: |-
+                  UploaderSettings are a map of key-value pairs that should be applied to the
+                  uploader configuration.
+                nullable: true
+                type: object
+              uploaderType:
+                description: UploaderType is the type of the uploader to handle the
+                  data transfer.
+                enum:
+                - kopia
+                - ""
+                type: string
+              volume:
+                description: Volume is the name of the volume within the Pod to be
+                  restored.
+                type: string
+            required:
+            - backupStorageLocation
+            - pod
+            - repoIdentifier
+            - snapshotID
+            - sourceNamespace
+            - volume
+            type: object
+          status:
+            description: PodVolumeRestoreStatus is the current status of a PodVolumeRestore.
+            properties:
+              acceptedTimestamp:
+                description: |-
+                  AcceptedTimestamp records the time the pod volume restore is to be prepared.
+                  The server's time is used for AcceptedTimestamp
+                format: date-time
+                nullable: true
+                type: string
+              completionTimestamp:
+                description: |-
+                  CompletionTimestamp records the time a restore was completed.
+                  Completion time is recorded even on failed restores.
+                  The server's time is used for CompletionTimestamps
+                format: date-time
+                nullable: true
+                type: string
+              message:
+                description: Message is a message about the pod volume restore's status.
+                type: string
+              node:
+                description: Node is name of the node where the pod volume restore
+                  is processed.
+                type: string
+              phase:
+                description: Phase is the current state of the PodVolumeRestore.
+                enum:
+                - New
+                - Accepted
+                - Prepared
+                - InProgress
+                - Canceling
+                - Canceled
+                - Completed
+                - Failed
+                type: string
+              progress:
+                description: |-
+                  Progress holds the total number of bytes of the snapshot and the current
+                  number of restored bytes. This can be used to display progress information
+                  about the restore operation.
+                properties:
+                  bytesDone:
+                    format: int64
+                    type: integer
+                  totalBytes:
+                    format: int64
+                    type: integer
+                type: object
+              startTimestamp:
+                description: |-
+                  StartTimestamp records the time a restore was started.
+                  The server's time is used for StartTimestamps
+                format: date-time
+                nullable: true
+                type: string
+            type: object
+        type: object
+    served: true
+    storage: true
+    subresources: {}
+```
+
+## Installation Changes
+There are no changes to installation; the backupPod/restorePod configurations are either inherited from node-agent or retrieved from node-agent-configmap.  
+
+## Upgrade
+Upgrade is not impacted.  
+
+## CLI
+CLI is not changed.
+
+
+
+[1]: ../unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
+[2]: ../volume-snapshot-data-movement/volume-snapshot-data-movement.md
+[3]: ../vgdp-micro-service/vgdp-micro-service.md
+[4]: ../node-agent-concurrency.md
+[5]: https://github.com/vmware-tanzu/velero/issues/8955
+[6]: https://github.com/vmware-tanzu/velero/issues/8956
+[7]: https://github.com/vmware-tanzu/velero/issues/8289
+[8]: https://kubernetes.io/docs/tasks/configure-pod-container/create-hostprocess-pod/
+[9]: https://github.com/vmware-tanzu/velero/issues/7725
+[10]: https://github.com/vmware-tanzu/velero/issues/8423
--- a/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-ms-1.png
+++ b/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-ms-1.png
--- a/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-ms-2.png
+++ b/design/Implemented/vgdp-micro-service-for-fs-backup/vgdp-ms-2.png
--- a/design/Implemented/volume-group-snapshot.md
+++ b/design/Implemented/volume-group-snapshot.md
@@ -0,0 +1,611 @@
+# Add Support for VolumeGroupSnapshots
+
+This proposal outlines the design and implementation plan for incorporating VolumeGroupSnapshot support into Velero. The enhancement will allow Velero to perform consistent, atomic snapshots of groups of Volumes using the new Kubernetes [VolumeGroupSnapshot API](https://kubernetes.io/blog/2024/12/18/kubernetes-1-32-volume-group-snapshot-beta/). This capability is especially critical for stateful applications that rely on multiple volumes to ensure data consistency, such as databases and analytics workloads.
+
+## Glossary & Abbreviation
+
+Terminology used in this document:
+- VGS: VolumeGroupSnapshot
+- VS: VolumeSnapshot
+- VGSC: VolumeGroupSnapshotContent
+- VSC: VolumeSnapshotContent
+- VGSClass: VolumeGroupSnapshotClass
+- VSClass: VolumeSnapshotClass
+
+## Background
+
+Velero currently enables snapshot-based backups on an individual Volume basis through CSI drivers. However, modern stateful applications often require multiple volumes for data, logs, and backups. This distributed data architecture increases the risk of inconsistencies when volumes are captured individually. Kubernetes has introduced the VolumeGroupSnapshot(VGS) API [(KEP-3476)](https://github.com/kubernetes/enhancements/pull/1551), which allows for the atomic snapshotting of multiple volumes in a coordinated manner. By integrating this feature, Velero can offer enhanced disaster recovery for multi-volume applications, ensuring consistency across all related data.
+
+## Goals
+- Ensure that multiple related volumes are snapshotted simultaneously, preserving consistency for stateful applications via VolumeGroupSnapshots(VGS) API.
+- Integrate VolumeGroupSnapshot functionality into Velero’s existing backup and restore workflows.
+- Allow users to opt in to volume group snapshots via specifying the group label.
+
+## Non-Goals
+- The proposal does not require a complete overhaul of Velero’s CSI integration; it will extend the current mechanism to support group snapshots.
+- No changes pertaining to the execution of Restore Hooks.
+
+## High-Level Design
+
+### Backup workflow:
+#### Accept the label to be used for VGS from the user:
+  - Accept the label from the user, we will do this in 3 ways:
+    - Firstly, we will have a hard-coded default label key like `velero.io/volume-group-snapshot` that the users can directly use on their PVCs.
+    - Secondly, we will let the users override this default VGS label via a velero server arg, `--volume-group-snapshot-label-key`, if needed.
+    - And Finally we will have the option to override the default label via Backup API spec, `backup.spec.volumeGroupSnapshotLabelKey`
+    - In all the instances, the VGS label key will be present on the backup spec, this makes the label key accessible to plugins during the execution of backup operation.
+  - This label will enable velero to filter the PVC to be included in the VGS spec.
+  - Users will have to label the PVCs before invoking the backup operation.
+  - This label would act as a group identifier for the PVCs to be grouped under a specific VGS.
+  - It will be used to collect the PVCs to be used for a particular instance of VGS object.  
+
+**Note:** 
+  - Modifying or adding VGS label on PVCs during an active backup operation may lead to unexpected or undesirable backup results. To avoid inconsistencies, ensure PVC labels remain unchanged throughout the backup execution.
+  - Label Key Precedence: When determining which label key to use for grouping PVCs into a VolumeGroupSnapshot, Velero applies overrides in the following order (highest to lowest):
+    - Backup API spec (`backup.spec.volumeGroupSnapshotLabelKey`)
+    - Server flag (`--volume-group-snapshot-label-key`)
+    - Built-in default (`velero.io/volume-group-snapshot`)
+
+    Whichever key wins this precedence is then injected into the Backup spec so that all Velero plugins can uniformly discover and use it during the backup execution.
+#### Changes to the Existing PVC ItemBlockAction plugin:
+  - Currently the PVC IBA plugin is applied to PVCs and adds the RelatedItems for the particular PVC into the ItemBlock.
+  - At first it checks whether the PVC is bound and VolumeName is non-empty.
+  - Then it adds the related PV under the list of relatedItems.
+  - Following on, the plugin adds the pods mounting the PVC as relatedItems.
+  - Now we need to extend this PVC IBA plugin to add the PVCs to be grouped for a particular VGS object, so that they are processed together under an ItemBlock by Velero.
+      - First we will check if the PVC that is being processed by the plugin has the user specified VGS label.
+      - If it is present then we will execute a List call in the namespace with the label as the matching criterion and see if this results in any PVCs (other than the current one).
+      - If there are PVCs matching the criteria then we add the PVCs to the relatedItems list.
+      - This helps in building the ItemBlock we need for VGS processing, i.e. we have the relevant pods and PVCs in the ItemBlock.
+
+**Note:** The ItemBlock to VGS relationship will not always be 1:1. There might be scenarios when the ItemBlock might have multiple VGS instances associated with it.
+Let's go over some ItemBlock/VGS scenarios that we might encounter and visualize them for clarity:
+1. Pod Mounts: Pod1 mounts both PVC1 and PVC2.  
+   Grouping: PVC1 and PVC2 share the same group label (group: A)  
+   ItemBlock: The item block includes Pod1, PVC1, and PVC2.  
+   VolumeGroupSnapshot (VGS): Because PVC1 and PVC2 are grouped together by their label, they trigger the creation of a single VGS (labeled with group: A).  
+
+```mermaid
+flowchart TD
+   subgraph ItemBlock
+   P1[Pod1]
+   PVC1[PVC1 group: A]
+   PVC2[PVC2 group: A]
+   end
+
+   P1 -->|mounts| PVC1
+   P1 -->|mounts| PVC2
+
+   PVC1 --- PVC2
+
+   PVC1 -- "group: A" --> VGS[VGS group: A]
+   PVC2 -- "group: A" --> VGS
+
+```   
+2. Pod Mounts: Pod1 mounts each of the four PVCs.  
+   Grouping:  
+     Group A: PVC1 and PVC2 share the same grouping label (group: A).  
+     Group B: PVC3 and PVC4 share the grouping label (group: B)   
+   ItemBlock: All objects (Pod1, PVC1, PVC2, PVC3, and PVC4) are collected into a single item block.   
+   VolumeGroupSnapshots:  
+      PVC1 and PVC2 (group A) point to the same VGS (VGS (group: A)).      
+      PVC3 and PVC4 (group B) point to a different VGS (VGS (group: B)).    
+```mermaid
+flowchart TD
+    subgraph ItemBlock
+        P1[Pod1]
+        PVC1[PVC1 group: A]
+        PVC2[PVC2 group: A]
+        PVC3[PVC3 group: B]
+        PVC4[PVC4 group: B]
+    end
+
+    %% Pod mounts all PVCs
+    P1 -->|mounts| PVC1
+    P1 -->|mounts| PVC2
+    P1 -->|mounts| PVC3
+    P1 -->|mounts| PVC4
+
+    %% Group A relationships: PVC1 and PVC2
+    PVC1 --- PVC2
+    PVC1 -- "group: A" --> VGS_A[VGS-A group: A]
+    PVC2 -- "group: A" --> VGS_A
+
+    %% Group B relationships: PVC3 and PVC4
+    PVC3 --- PVC4
+    PVC3 -- "group: B" --> VGS_B[VGS-B group: B]
+    PVC4 -- "group: B" --> VGS_B
+```
+
+3. Pod Mounts: Pod1 mounts both PVC1 and PVC2, Pod2 mounts PVC1 and PVC3.  
+   Grouping:   
+     Group A: PVC1 and PVC2  
+     Group B: PVC3  
+   ItemBlock: All objects (Pod1, Pod2, PVC1, PVC2, and PVC3) are collected into a single item block.  
+   VolumeGroupSnapshots:  
+     PVC1 and PVC2 (group A) point to the same VGS (VGS (group: A)).   
+     PVC3 (group B) points to a different VGS (VGS (group: B)).  
+```mermaid
+flowchart TD
+    subgraph ItemBlock
+        P1[Pod1]
+        P2[Pod2]
+        PVC1[PVC1 group: A]
+        PVC2[PVC2 group: A]
+        PVC3[PVC3 group: B]
+    end
+
+    %% Pod mount relationships
+    P1 -->|mounts| PVC1
+    P1 -->|mounts| PVC2
+    P2 -->|mounts| PVC1
+    P2 -->|mounts| PVC3
+
+    %% Grouping for Group A: PVC1 and PVC2 are grouped into VGS_A
+    PVC1 --- PVC2
+    PVC1 -- "Group A" --> VGS_A[VGS Group A]
+    PVC2 -- "Group A" --> VGS_A
+
+    %% Grouping for Group B: PVC3 grouped into VGS_B
+    PVC3 -- "Group B" --> VGS_B[VGS Group B]
+    
+```
+
+#### Updates to CSI PVC plugin:
+The CSI PVC plugin now supports obtaining a VolumeSnapshot (VS) reference for a PVC in three ways, and then applies common branching for datamover and non‑datamover workflows:
+
+- Scenario 1: PVC has a VGS label and no VS (created via the VGS workflow) exists for its volume group:
+    - Determine VGSClass: The plugin will pick `VolumeGroupSnapshotClass` by following the same tier based precedence as it does for individual `VolumeSnapshotClasses`:
+      - Default by Label: Use the one VGSClass labeled
+      ```yaml
+      metadata:
+        labels:
+          velero.io/csi-volumegroupsnapshot-class: "true"
+
+      ```
+      whose `spec.driver` matches the CSI driver used by the PVCs.
+      - Backup‑level Override: If the Backup CR has an annotation
+      ```yaml
+      metadata:
+        annotations:
+          velero.io/csi-volumegroupsnapshot-class_<driver>: <className>
+      ```
+      (with <driver> equal to the PVCs’ CSI driver), use that class.
+      - PVC‑level Override: Finally, if the PVC itself carries an annotation
+      ```yaml
+      metadata:
+        annotations:
+          velero.io/csi-volume-group-snapshot-class: <className>
+      ```
+      and that class exists, use it.
+      At each step, if the plugin finds zero or multiple matching classes, VGS creation is skipped and backup fails.
+    - Create VGS: The plugin creates a new VolumeGroupSnapshot (VGS) for the PVC’s volume group. This action automatically triggers creation of the corresponding VGSC, VS, and VSC objects.
+    - Wait for VS Status: The plugin waits until each VS (one per PVC in the group) has its `volumeGroupSnapshotName` populated. This confirms that the snapshot controller has completed its work. `CSISnapshotTimeout` will be used here.
+    - Update VS Objects: Once the VS objects are provisioned, the plugin updates them by removing VGS owner references and VGS-related finalizers, and by adding backup metadata labels (including BackupName, BackupUID, and PVC name). These labels are later used to detect an existing VS when processing another PVC of the same group.
+    - Patch and Cleanup: The plugin patches the deletionPolicy of the VGSC to "Retain" (ensuring that deletion of the VGSC does not remove the underlying VSC objects or storage snapshots) and then deletes the temporary VGS and VGSC objects.
+        
+- Scenario 2: PVC has a VGS label and a VS created via an earlier VGS workflow already exists:
+    - The plugin lists VS objects in the PVC’s namespace using backup metadata labels (BackupUID, BackupName, and PVCName).
+    - It verifies that at least one VS has a non‑empty `volumeGroupSnapshotName` in its status.
+    - If such a VS exists, the plugin skips creating a new VGS (or VS) and proceeds with the legacy workflow using the existing VS.
+    - If a VS is found but its status does not indicate it was created by the VGS workflow (i.e. its `volumeGroupSnapshotName` is empty), the backup for that PVC is failed, resulting in a partially failed backup.
+- Scenario 3: PVC does not have a VGS label:
+    - The legacy workflow is followed, and an individual VolumeSnapshot (VS) is created for the PVC.
+- Common Branching for Datamover and Non‑datamover Workflows:
+    - Once a VS reference (`vsRef`) is determined—whether through the VGS workflow (Scenario 1 or 2) or the legacy workflow (Scenario 3)—the plugin then applies the common branching:
+        - Non‑datamover Case: The VS reference is directly added as an additional backup item.
+            
+        - Datamover Case: The plugin waits until the VS’s associated VSC snapshot handle is ready (using the configured CSISnapshotTimeout), then creates a DataUpload for the VS–PVC pair. The resulting DataUpload is then added as an additional backup item.
+
+
+```mermaid
+flowchart TD
+    %% Section 1: Accept VGS Label from User
+    subgraph Accept_Label
+      A1[User sets VGS label key using default velero.io/volume-group-snapshot or via server arg or Backup API spec]
+      A2[User labels PVCs before backup]
+      A1 --> A2
+    end
+
+    %% Section 2: PVC ItemBlockAction Plugin Extension
+    subgraph PVC_ItemBlockAction
+      B1[Check PVC is bound and has VolumeName]
+      B2[Add related PV to relatedItems]
+      B3[Add pods mounting PVC to relatedItems]
+      B4[Check if PVC has user-specified VGS label]
+      B5[List PVCs in namespace matching label criteria]
+      B6[Add matching PVCs to relatedItems]
+      B1 --> B2 --> B3 --> B4
+      B4 -- Yes --> B5
+      B5 --> B6
+    end
+
+    %% Section 3: CSI PVC Plugin Updates
+    subgraph CSI_PVC_Plugin
+       C1[For each PVC, check for VGS label]
+       C1 -- Has VGS label --> C2[Determine scenario]
+       C1 -- No VGS label --> C16[Scenario 3: Legacy workflow - create individual VS]
+
+       %% Scenario 1: No existing VS via VGS exists
+       subgraph Scenario1[Scenario 1: No existing VS via VGS]
+         S1[List grouped PVCs using VGS label]
+         S2[Determine CSI driver for grouped PVCs]
+         S3[If single CSI driver then select matching VGSClass; else fail backup]
+         S4[Create new VGS triggering VGSC, VS, and VSC creation]
+         S5[Wait for VS objects to have nonempty volumeGroupSnapshotName]
+         S6[Update VS objects; remove VGS owner refs and finalizers; add backup metadata labels]
+         S7[Patch VGSC deletionPolicy to Retain]
+         S8[Delete transient VGS and VGSC]
+         S1 --> S2 --> S3 --> S4 --> S5 --> S6 --> S7 --> S8
+		 
+       end
+
+       %% Scenario 2: Existing VS via VGS exists
+       subgraph Scenario2[Scenario 2: Existing VS via VGS exists]
+         S9[List VS objects using backup metadata - BackupUID, BackupName, PVCName]
+         S10[Check if any VS has nonempty volumeGroupSnapshotName]
+         S9 --> S10
+         S10 -- Yes --> S11[Use existing VS]
+         S10 -- No --> S12[Fail backup for PVC]
+       end
+
+       C2 -- Scenario1 applies --> S1
+       C2 -- Scenario2 applies --> S9
+
+       %% Common Branch: After obtaining a VS reference
+       subgraph Common_Branch[Common Branch]
+         CB1[Obtain VS reference as vsRef]
+         CB2[If non-datamover, add vsRef as additional backup item]
+         CB3[If datamover, wait for VSC handle and create DataUpload; add DataUpload as additional backup item]
+         CB1 --> CB2
+         CB1 --> CB3
+       end
+       
+       %% Connect Scenario outcomes and legacy branch to the common branch
+       S8 --> CB1
+       S11 --> CB1
+       C16 --> CB1
+    end
+
+    %% Overall Flow Connections
+    A2 --> B1
+    B6 --> C1
+
+```
+
+
+Restore workflow:
+
+- No changes required for the restore workflow.
+
+## Detailed Design
+
+Backup workflow:
+- Accept the label to be used for VGS from the user as a server argument:
+    - Set a default VGS label key to be used:
+    ```go
+    // default VolumeGroupSnapshot Label
+	defaultVGSLabelKey = "velero.io/volume-group-snapshot"
+    
+    ```
+    - Add this as a server flag and pass it to backup reconciler, so that we can use it during the backup request execution.
+    ```go
+    flags.StringVar(&c.DefaultVGSLabelKey, "volume-group-snapshot-label-key", c.DefaultVGSLabelKey, "Label key for grouping PVCs into VolumeGroupSnapshot")
+    ```
+
+    - Update the Backup CRD to accept the VGS Label Key as a spec value:
+    ```go
+    // VolumeGroupSnapshotLabelKey specifies the label key to be used for grouping the PVCs under
+	// an instance of VolumeGroupSnapshot, if left unspecified velero.io/volume-group-snapshot is used
+	// +optional
+	VolumeGroupSnapshotLabelKey string `json:"volumeGroupSnapshotLabelKey,omitempty"`
+    ```
+    - Modify the [`prepareBackupRequest` function](https://github.com/openshift/velero/blob/8c8a6cccd78b78bd797e40189b0b9bee46a97f9e/pkg/controller/backup_controller.go#L327) to set the default label key as a backup spec if the user does not specify any value:
+    ```go
+    if len(request.Spec.VolumeGroupSnapshotLabelKey) == 0 {
+		// set the default key value
+		request.Spec.VolumeGroupSnapshotLabelKey = b.defaultVGSLabelKey
+	}
+    ```
+
+- Changes to the Existing [PVC ItemBlockAction plugin](https://github.com/vmware-tanzu/velero/blob/512199723ff95d5016b32e91e3bf06b65f57d608/pkg/itemblock/actions/pvc_action.go#L64) (Update the GetRelatedItems function):
+```go
+// Retrieve the VGS label key from the Backup spec.
+	vgsLabelKey := backup.Spec.VolumeGroupSnapshotLabelKey
+	if vgsLabelKey != "" {
+		// Check if the PVC has the specified VGS label.
+		if groupID, ok := pvc.Labels[vgsLabelKey]; ok {
+			// List all PVCs in the namespace with the same label key and value (i.e. same group).
+			pvcList := new(corev1api.PersistentVolumeClaimList)
+			if err := a.crClient.List(context.Background(), pvcList, crclient.InNamespace(pvc.Namespace), crclient.MatchingLabels{vgsLabelKey: groupID}); err != nil {
+				return nil, errors.Wrap(err, "failed to list PVCs for VGS grouping")
+			}
+			// Add each matching PVC (except the current one) to the relatedItems.
+			for _, groupPVC := range pvcList.Items {
+				if groupPVC.Name == pvc.Name {
+					continue
+				}
+				a.log.Infof("Adding grouped PVC %s to relatedItems for PVC %s", groupPVC.Name, pvc.Name)
+				relatedItems = append(relatedItems, velero.ResourceIdentifier{
+					GroupResource: kuberesource.PersistentVolumeClaims,
+					Namespace:     groupPVC.Namespace,
+					Name:          groupPVC.Name,
+				})
+			}
+		}
+	} else {
+		a.log.Info("No VolumeGroupSnapshotLabelKey provided in backup spec; skipping PVC grouping")
+	}
+```
+
+- Updates to [CSI PVC plugin](https://github.com/vmware-tanzu/velero/blob/512199723ff95d5016b32e91e3bf06b65f57d608/pkg/backup/actions/csi/pvc_action.go#L200) (Update the Execute method):
+```go
+// Execute backs up a PVC by obtaining a VolumeSnapshot (VS) for it and, when
+// snapshot data movement is enabled, submitting a DataUpload for that VS.
+//
+// The VS is obtained in one of three ways:
+//   - the PVC carries the VGS label and a VS created by an earlier VGS workflow
+//     of the same backup already exists: that VS is reused;
+//   - the PVC carries the VGS label and no such VS exists: a VolumeGroupSnapshot
+//     is created for the whole group and the VS created for this PVC is used;
+//   - the PVC has no VGS label: an individual VS is created (legacy workflow).
+//
+// It returns the updated PVC, the additional items to back up, the async
+// operation ID (datamover case only), the items to update once the operation
+// completes (datamover case only), and an error.
+func (p *pvcBackupItemAction) Execute(
+    item runtime.Unstructured,
+    backup *velerov1api.Backup,
+) (
+    runtime.Unstructured,
+    []velero.ResourceIdentifier,
+    string,
+    []velero.ResourceIdentifier,
+    error,
+) {
+    p.log.Info("Starting PVCBackupItemAction")
+
+    // Validate backup policy and PVC/PV
+    if valid := p.validateBackup(*backup); !valid {
+        return item, nil, "", nil, nil
+    }
+
+    var pvc corev1api.PersistentVolumeClaim
+    if err := runtime.DefaultUnstructuredConverter.FromUnstructured(item.UnstructuredContent(), &pvc); err != nil {
+        return nil, nil, "", nil, errors.WithStack(err)
+    }
+    if valid, item, err := p.validatePVCandPV(pvc, item); !valid {
+        if err != nil {
+            return nil, nil, "", nil, err
+        }
+        return item, nil, "", nil, nil
+    }
+
+    shouldSnapshot, err := volumehelper.ShouldPerformSnapshotWithBackup(
+        item,
+        kuberesource.PersistentVolumeClaims,
+        *backup,
+        p.crClient,
+        p.log,
+    )
+    if err != nil {
+        return nil, nil, "", nil, err
+    }
+    if !shouldSnapshot {
+        p.log.Debugf("CSI plugin skip snapshot for PVC %s according to VolumeHelper setting", pvc.Namespace+"/"+pvc.Name)
+        return nil, nil, "", nil, nil
+    }
+
+    var additionalItems []velero.ResourceIdentifier
+    var operationID string
+    var itemToUpdate []velero.ResourceIdentifier
+
+    // vsRef will be our common reference to the VolumeSnapshot (VS)
+    var vsRef *corev1api.ObjectReference
+
+    // Retrieve the VGS label key from the backup spec.
+    vgsLabelKey := backup.Spec.VolumeGroupSnapshotLabelKey
+
+    // Check if the PVC has the user-specified VGS label.
+    if group, ok := pvc.Labels[vgsLabelKey]; ok && group != "" {
+        p.log.Infof("PVC %s has VGS label with group %s", pvc.Name, group)
+        // --- VGS branch ---
+        // 1. Check if a VS created via a VGS workflow exists for this PVC.
+        existingVS, err := p.findExistingVSForBackup(backup.UID, backup.Name, pvc.Name, pvc.Namespace)
+        if err != nil {
+            return nil, nil, "", nil, err
+        }
+        if existingVS != nil && existingVS.Status.VolumeGroupSnapshotName != "" {
+            p.log.Infof("Existing VS %s found for PVC %s in group %s; skipping VGS creation", existingVS.Name, pvc.Name, group)
+            vsRef = &corev1api.ObjectReference{
+                Namespace: existingVS.Namespace,
+                Name:      existingVS.Name,
+            }
+        } else {
+            // 2. No existing VS via VGS; execute VGS creation workflow.
+            groupedPVCs, err := p.listGroupedPVCs(backup, pvc.Namespace, vgsLabelKey, group)
+            if err != nil {
+                return nil, nil, "", nil, err
+            }
+            pvcNames := extractPVCNames(groupedPVCs)
+            // Determine the CSI driver used by the grouped PVCs.
+            driver, err := p.determineCSIDriver(groupedPVCs)
+            if err != nil {
+                return nil, nil, "", nil, errors.Wrap(err, "failed to determine CSI driver for grouped PVCs")
+            }
+            if driver == "" {
+                return nil, nil, "", nil, errors.New("multiple CSI drivers found for grouped PVCs; failing backup")
+            }
+            // Retrieve the appropriate VGSClass for the CSI driver.
+            vgsClass := p.getVGSClassForDriver(driver)
+            p.log.Infof("Determined CSI driver %s with VGSClass %s for PVC group %s", driver, vgsClass, group)
+
+            newVGS, err := p.createVolumeGroupSnapshot(backup, pvc, pvcNames, vgsLabelKey, group, vgsClass)
+            if err != nil {
+                return nil, nil, "", nil, err
+            }
+            p.log.Infof("Created new VGS %s for PVC group %s", newVGS.Name, group)
+
+            // Wait for the VS objects created via VGS to have volumeGroupSnapshotName in status.
+            if err := p.waitForVGSAssociatedVS(newVGS, pvc.Namespace, backup.Spec.CSISnapshotTimeout.Duration); err != nil {
+                return nil, nil, "", nil, err
+            }
+            // Update the VS objects: remove VGS owner references and finalizers; add backup metadata labels.
+            if err := p.updateVGSCreatedVS(newVGS, backup); err != nil {
+                return nil, nil, "", nil, err
+            }
+            // Patch the VGSC deletionPolicy to Retain.
+            if err := p.patchVGSCDeletionPolicy(newVGS, pvc.Namespace); err != nil {
+                return nil, nil, "", nil, err
+            }
+            // Delete the VGS and VGSC
+            if err := p.deleteVGSAndVGSC(newVGS, pvc.Namespace); err != nil {
+                return nil, nil, "", nil, err
+            }
+            // Fetch the VS that was created for this PVC via VGS.
+            vs, err := p.getVSForPVC(backup, pvc, vgsLabelKey, group)
+            if err != nil {
+                return nil, nil, "", nil, err
+            }
+            vsRef = &corev1api.ObjectReference{
+                Namespace: vs.Namespace,
+                Name:      vs.Name,
+            }
+        }
+    } else {
+        // Legacy workflow: PVC does not have a VGS label; create an individual VS.
+        vs, err := p.createVolumeSnapshot(pvc, backup)
+        if err != nil {
+            return nil, nil, "", nil, err
+        }
+        vsRef = &corev1api.ObjectReference{
+            Namespace: vs.Namespace,
+            Name:      vs.Name,
+        }
+    }
+
+    // --- Common Branch ---
+    // Now we have vsRef populated from one of the above cases.
+
+    // Common PVC labels and annotations. They are built before branching because
+    // the datamover branch adds the DataUpload annotation to the same map, and
+    // they must keep pointing at the VolumeSnapshot in both branches.
+    labels := map[string]string{
+        velerov1api.VolumeSnapshotLabel: vsRef.Name,
+        velerov1api.BackupNameLabel:     backup.Name,
+    }
+    annotations := map[string]string{
+        velerov1api.VolumeSnapshotLabel:                 vsRef.Name,
+        velerov1api.MustIncludeAdditionalItemAnnotation: "true",
+    }
+
+    // Branch further based on backup.Spec.SnapshotMoveData.
+    if boolptr.IsSetToTrue(backup.Spec.SnapshotMoveData) {
+        // Datamover case:
+        operationID = label.GetValidName(
+            string(velerov1api.AsyncOperationIDPrefixDataUpload) + string(backup.UID) + "." + string(pvc.UID),
+        )
+        dataUploadLog := p.log.WithFields(logrus.Fields{
+            "Source PVC":     fmt.Sprintf("%s/%s", pvc.Namespace, pvc.Name),
+            "VolumeSnapshot": fmt.Sprintf("%s/%s", vsRef.Namespace, vsRef.Name),
+            "Operation ID":   operationID,
+            "Backup":         backup.Name,
+        })
+        // Retrieve the current VS using vsRef
+        vs := &snapshotv1api.VolumeSnapshot{}
+        if err := p.crClient.Get(context.TODO(), crclient.ObjectKey{Namespace: vsRef.Namespace, Name: vsRef.Name}, vs); err != nil {
+            return nil, nil, "", nil, errors.Wrapf(err, "failed to get VolumeSnapshot %s", vsRef.Name)
+        }
+        // Wait until the VS-associated VSC snapshot handle is ready.
+        _, err := csi.WaitUntilVSCHandleIsReady(
+            vs,
+            p.crClient,
+            p.log,
+            true,
+            backup.Spec.CSISnapshotTimeout.Duration,
+        )
+        if err != nil {
+            dataUploadLog.Errorf("Failed to wait for VolumeSnapshot to become ReadyToUse: %s", err.Error())
+            csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
+            return nil, nil, "", nil, errors.WithStack(err)
+        }
+        dataUploadLog.Info("Starting data upload of backup")
+        dataUpload, err := createDataUpload(
+            context.Background(),
+            backup,
+            p.crClient,
+            vs,
+            &pvc,
+            operationID,
+        )
+        if err != nil {
+            dataUploadLog.WithError(err).Error("Failed to submit DataUpload")
+            if deleteErr := p.crClient.Delete(context.TODO(), vs); deleteErr != nil && !apierrors.IsNotFound(deleteErr) {
+                dataUploadLog.WithError(deleteErr).Error("Failed to delete VolumeSnapshot")
+            }
+            // The unmodified item is returned with a nil error here; the failure
+            // is only reported through the error log above.
+            return item, nil, "", nil, nil
+        }
+        dataUploadLog.Info("DataUpload submitted successfully")
+        itemToUpdate = []velero.ResourceIdentifier{
+            {
+                GroupResource: schema.GroupResource{
+                    Group:    "velero.io",
+                    Resource: "datauploads",
+                },
+                Namespace: dataUpload.Namespace,
+                Name:      dataUpload.Name,
+            },
+        }
+        annotations[velerov1api.DataUploadNameAnnotation] = dataUpload.Namespace + "/" + dataUpload.Name
+        // For the datamover case, add the dataUpload as an additional item directly.
+        // vsRef is left untouched so the VolumeSnapshot label/annotation set above
+        // keep the VS name rather than the DataUpload name.
+        additionalItems = append(additionalItems, velero.ResourceIdentifier{
+            GroupResource: schema.GroupResource{
+                Group:    "velero.io",
+                Resource: "datauploads",
+            },
+            Namespace: dataUpload.Namespace,
+            Name:      dataUpload.Name,
+        })
+    } else {
+        // Non-datamover case:
+        // Use vsRef for snapshot purposes.
+        additionalItems = append(additionalItems, convertVSToResourceIdentifiersFromRef(vsRef)...)
+        p.log.Infof("VolumeSnapshot additional item added for VS %s", vsRef.Name)
+    }
+
+    // Update PVC metadata with common labels and annotations.
+    kubeutil.AddAnnotations(&pvc.ObjectMeta, annotations)
+    kubeutil.AddLabels(&pvc.ObjectMeta, labels)
+
+    p.log.Infof("Returning from PVCBackupItemAction with %d additionalItems to backup", len(additionalItems))
+    for _, ai := range additionalItems {
+        p.log.Debugf("%s: %s", ai.GroupResource.String(), ai.Name)
+    }
+
+    pvcMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(&pvc)
+    if err != nil {
+        return nil, nil, "", nil, errors.WithStack(err)
+    }
+
+    return &unstructured.Unstructured{Object: pvcMap},
+        additionalItems, operationID, itemToUpdate, nil
+}
+
+
+```
+
+## Implementation
+
+This design proposal is targeted for velero 1.16.
+
+The implementation of this proposed design is targeted for velero 1.17.
+
+**Note:**
+- VGS support isn't a requirement on restore. The design does not have any VGS related elements/considerations in the restore workflow.
+
+## Requirements and Assumptions
+- Kubernetes Version:
+  - Minimum: v1.32.0 or later, since the VolumeGroupSnapshot API goes beta in 1.32.
+  - Assumption: CRDs for `VolumeGroupSnapshot`, `VolumeGroupSnapshotClass`, and `VolumeGroupSnapshotContent` are already installed.
+
+- VolumeGroupSnapshot API Availability:
+  - If the VGS API group (`groupsnapshot.storage.k8s.io/v1beta1`) is not present, Velero backup will fail.
+
+- CSI Driver Compatibility
+  - Only CSI drivers that implement the VolumeGroupSnapshot admission and controller support this feature.
+  - Upon VGS creation, we assume the driver will atomically snapshot all matching PVCs; if it does not, the plugin may time out.
+
+## Performance Considerations
+- Use VGS if you have many similar volumes that must be snapped together and you want to minimize API/server load.
+- Use individual VS if you have only a few volumes, or want one‐volume failures to be isolated.
+
+## Testing Strategy
+
+- Unit tests: We will add targeted unit tests to cover all new code paths—including existing-VS detection, VGS creation, legacy VS fallback, and error scenarios.
+- E2E tests: For E2E we would need a Kind cluster with a CSI driver that supports group snapshots. The test would deploy an application with multiple PVCs, execute a Velero backup and restore, and verify that a VGS is created, all underlying VS objects reach ReadyToUse, and every PVC is restored successfully.
--- a/design/backup-repo-cache-volume.md
+++ b/design/backup-repo-cache-volume.md
@@ -0,0 +1,231 @@
+# Backup Repository Cache Volume Design
+
+## Glossary & Abbreviation
+
+**Backup Storage**: The storage to store the backup data. Check [Unified Repository design][1] for details.  
+**Backup Repository**: Backup repository is layered between BR data movers and Backup Storage to provide BR related features that is introduced in [Unified Repository design][1].  
+**Velero Generic Data Path (VGDP)**: VGDP is the collective of modules that is introduced in [Unified Repository design][1]. Velero uses these modules to finish data transfer for various purposes (i.e., PodVolume backup/restore, Volume Snapshot Data Movement). VGDP modules include uploaders and the backup repository.  
+**Data Mover Pods**: Intermediate pods which hold VGDP and complete the data transfer. See [VGDP Micro Service for Volume Snapshot Data Movement][2] and [VGDP Micro Service For fs-backup][3] for details.  
+**Repository Maintenance Pods**: Pods for [Repository Maintenance Jobs][4], which hold VGDP to run repository maintenance.  
+
+## Background
+
+According to the [Unified Repository design][1], Velero uses selectable backup repositories for various backup/restore methods, e.g., fs-backup and volume snapshot data movement. Some backup repositories may need to cache data on the client side for various repository operations, so as to accelerate the execution.  
+In the existing [Backup Repository Configuration][5], we allow users to configure the cache data size (`cacheLimitMB`). However, the cache data is still stored in the root file system of data mover pods/repository maintenance pods, and therefore in the root file system of the node. This is not good enough, for the following reasons:  
+- In many distributions, the node's system disk size is predefined, non-configurable and limited, e.g., the system disk size may be 20G or less
+- Velero supports concurrent data movements in each node. The cache in each of the concurrent data mover pods could quickly run out of the system disk and cause problems like pod eviction, failure of pod creation, degradation of Kubernetes QoS, etc.  
+
+We need to allow users to prepare a dedicated location, e.g., a dedicated volume, for the cache.  
+Not all backup repositories, and not all backup repository operations, require a cache, so we need to define when and how the cache is used.  
+
+## Goals
+
+- Create a mechanism for users to configure cache volumes for various pods running VGDP
+- Design the workflow to assign the cache volume pod path to backup repositories
+- Describe when and how the cache volume is used 
+
+## Non-Goals
+
+- The solution is based on [Unified Repository design][1], [VGDP Micro Service for Volume Snapshot Data Movement][2] and [VGDP Micro Service For fs-backup][3]; legacy data paths are not supported. E.g., when a pod volume restore (PVR) runs with the legacy Restic path, if any data is cached, the cache still resides in the root file system.  
+
+## Solution
+
+### Cache Data
+
+Depending on the backup repository, cache data may include payload data or repository metadata, e.g., indexes to the payload data chunks.  
+
+Payload data is highly related to the backup data, and normally takes the majority of the repository data as well as the cache data.
+
+Repository metadata is related to the backup repository's chunking algorithm, data chunk mapping method, etc, and so the size is not proportional to the backup data size.  
+On the other hand, for some backup repositories, in extreme cases, the repository metadata may be significantly large. E.g., Kopia's indexes are per chunk; if there is a huge number of small files in the repository, Kopia's index data may be at the same level as, or even larger than, the payload data.  
+However, in the cases where repository metadata becomes the majority, other bottlenecks may emerge and the concurrency of data movers may be significantly constrained, so the requirement for cache volumes may go away.  
+
+Therefore, for now we only consider the cache volume requirement for payload data, and leave the consideration for metadata as a future enhancement.  
+
+### Scenarios
+
+Backup repository cache usage varies with the backup repository and with the backup repository operations performed during VGDP runs. Below are the scenarios when VGDP runs:
+- Data Upload for Backup: this is the process to upload/write the backup data into the backup repository, e.g., DataUpload or PodVolumeBackup. The pieces of data are almost directly written to the repository, sometimes with a small group staying briefly in the local place. That is to say, there should not be large scale data cached for this scenario, so we don't prepare dedicated cache for this scenario.
+- Repository Maintenance: Repository maintenance most often visits the backup repository's metadata and sometimes it needs to visit the file system directories from the backed up data. On the other hand, it is not practical to run concurrent maintenance jobs in one node. So the cache data is not large and does not affect the root file system too much. Therefore, we don't need to prepare dedicated cache for this scenario.
+- Data Download for Restore: this is the process to download/read the backup data from the backup repository during restore, e.g., DataDownload or PodVolumeRestore. For backup repositories whose data is stored in remote backup storage (e.g., Kopia repository stores data in remote object stores), a large amount of data is cached locally to accelerate the restore. Therefore, we need dedicated cache volumes for this scenario.  
+- Backup Deletion: During this scenario, backup repository is connected, metadata is enumerated to find the repository snapshot representing the backup data. That is to say, only metadata is cached if any. Therefore, dedicated cache volumes are not required in this scenario.
+
+The above analyses are based on the common behavior of backup repositories and do not consider the case where backup repository metadata takes the majority or a significant proportion of the cache data.  
+As a conclusion of the analyses, we will create dedicated cache volumes for restore scenarios.  
+For other scenarios, we can add them according to future changes/requirements. The mechanism to expose and connect the cache volumes should work for all scenarios. E.g., if we need to consider the backup repository metadata case, we may need cache volumes for backup and repository maintenance as well, then we can just reuse the same cache volume provision and connection mechanism for backup and repository maintenance scenarios.  
+
+### Cache Data and Lifecycle
+
+If available, one cache volume is dedicated to one data mover pod. That is, the cached data is destroyed when the data mover pod completes. Then the backup repository instance also closes.  
+Cache data are fully managed by the specific backup repository. So the backup repository may also have its own way to GC the cache data.  
+That is to say, cache data GC may be launched by the backup repository instance during the running of the data mover pod; then the remaining data are automatically destroyed when the data mover pod and the cache PVC are destroyed (the `reclaimPolicy` for the cache volume is always `Delete`, so once the cache PVC is destroyed, the volume will also be destroyed). So no special logic is needed for cache data GC.  
+
+### Data Size
+
+Cache volumes take storage space and cluster resources (PVC, PV), therefore, cache volumes should be created only when necessary and the volumes should be with reasonable size based on the cache data size:  
+- It is not a good bargain to have cache volumes for small backups, small backups will use resident cache location (the cache location in the root file system)
+- The cache data size has a limit, the existing `cacheLimitMB` is used for this purpose. E.g., it could be set as 1024 for a 1TB backup, which means 1GB of data is cached and the old cache data exceeding this size will be cleared. Therefore, it is meaningless to set the cache volume size much larger than `cacheLimitMB`
+
+### Cache Volume Size
+
+The cache volume size is calculated from below factors (for Restore scenarios):  
+- **Limit**: The limit of the cache data, that is represented by `cacheLimitMB`, the default value is 5GB
+- **backupSize**: The size of the backup as a reference to evaluate whether to create a cache volume. It doesn't mean the backup data really decides the cache data all the time, it is just a reference to evaluate the scale of the backup, small scale backups may need small cache data. Sometimes, backupSize is irrelevant to the size of cache data; in this case, ResidentThreshold should not be set, and Limit will be used directly. It is unlikely that backupSize is unavailable, but once that happens, ResidentThreshold is ignored and Limit will be used directly.  
+- **ResidentThreshold**: The minimum backup size for which a cache volume is created
+- **InflationPercentage**: Considering the overhead of the file system and the possible delay of the cache cleanup, there should be an inflation for the final volume size vs. the logical size, otherwise, the cache volume may be overrun. This inflation percentage is hardcoded, e.g., 20%. 
+
+A formula is as below:  
+```
+cacheVolumeSize = ((backupSize != 0 ? (backupSize > residentThreshold ? limit : 0) : limit) * (100 + inflationPercentage)) / 100
+```
+Finally, the `cacheVolumeSize` will be rounded up to GiB considering the UX friendliness, storage friendliness and management friendliness.    
+
+### PVC/PV
+
+The PVC for a cache volume is created in Velero namespace and a storage class is required for the cache PVC. The PVC's accessMode is `ReadWriteOnce` and volumeMode is `Filesystem`, so the storage class provided should support this specification. Otherwise, if the storageclass doesn't support either of the specifications, the data mover pod may hang in `Pending` state until a timeout setting of the data movement (e.g. `prepareTimeout`) is reached, and the data movement will finally fail.  
+It is not expected that the cache volume is retained after data mover pod is deleted, so the `reclaimPolicy` for the storageclass must be `Delete`.  
+
+To detect the problems in the storageclass and fail earlier, a validation is applied to the storageclass and once the validation fails, the cache configuration will be ignored, so the data mover pod will be created without a cache volume.  
+
+### Cache Volume Configurations
+
+Below configurations are introduced:
+- **residentThresholdMB**: the minimum data size (in MB) to be processed (if available) for which a cache volume is created
+- **cacheStorageClass**: the name of the storage class to provision the cache PVC
+
+Unlike `cacheLimitMB`, which is set on and affects the backup repository, the above two configurations are actually data mover configurations describing how to create cache volumes for data mover pods; and the two configurations don't need to be per backup repository. So we add them to the node-agent Configuration.  
+
+### Sample
+
+Below are some examples of the node-agent configMap with the configurations:
+
+Sample-1:  
+```json
+{
+    "cacheVolume": {
+        "storageClass": "sc-1",
+        "residentThresholdMB": 1024        
+    }
+}
+```
+
+Sample-2:  
+```json
+{
+    "cacheVolume": {
+        "storageClass": "sc-1"
+    }
+}
+```
+
+Sample-3:  
+```json
+{
+    "cacheVolume": {
+        "residentThresholdMB": 1024        
+    }
+}
+```
+
+**sample-1**: This is a valid configuration. Restores with backup data size larger than 1G will be assigned a cache volume using storage class `sc-1`.  
+**sample-2**: This is a valid configuration. Data mover pods are always assigned a cache volume using storage class `sc-1`.  
+**sample-3**: This is not a valid configuration because the storage class is absent. Velero gives up creating a cache volume.   
+
+To create the configMap, users need to save something like the above sample to a json file and then run below command:
+```
+kubectl create cm <ConfigMap name> -n velero --from-file=<json file name>
+```
+
+The cache volume configurations will be visited by node-agent server, so users also need to pass the `--node-agent-configmap` parameter to `velero node-agent`.  
+
+## Detailed Design
+
+### Backup and Restore
+
+The restore needs to know the backup size so as to calculate the cache volume size, some new fields are added to the DataDownload and PodVolumeRestore CRDs.  
+
+`snapshotSize` field is also added to DataDownload and PodVolumeRestore's `spec`:
+```yaml
+          spec:
+              snapshotID:
+                description: SnapshotID is the ID of the Velero backup snapshot to
+                  be restored from.
+                type: string
+              snapshotSize:
+                description: SnapshotSize is the logical size of the snapshot.
+                format: int64
+                type: integer
+```
+
+`snapshotSize` represents the total size of the backup; during restore, the value is transferred from DataUpload/PodVolumeBackup's `Status.Progress.TotalBytes` to DataDownload/PodVolumeRestore.    
+
+It is unlikely that `Status.Progress.TotalBytes` from DataUpload/PodVolumeBackup is unavailable, but once it happens, according to the above formula, `residentThresholdMB` is ignored, cache volume size is calculated directly from cache limit for the corresponding backup repository.  
+
+### Exposer
+
+Cache volume configurations are retrieved by node-agent and passed through DataDownload/PodVolumeRestore to GenericRestore exposer/PodVolume exposer.  
+The exposers are responsible to calculate cache volume size, create cache PVCs and mount them to the restorePods.  
+If the calculated cache volume size is 0, or any of the critical parameters is missing (e.g., cache volume storage class), the exposers ignore the cache volume configuration and continue with creating restorePods without cache volumes, so no impact to the result of the restore.  
+
+Exposers mount the cache volume to a predefined directory and pass the directory to the data mover pods through the `cache-volume-path` parameter.  
+
+Below data structure is added to the exposers' expose parameters:  
+
+```go
+type GenericRestoreExposeParam struct {
+	// RestoreSize specifies the data size for the volume to be restored
+	RestoreSize int64
+
+	// CacheVolume specifies the info for cache volumes
+	CacheVolume *repocache.CacheConfigs
+}
+
+type PodVolumeExposeParam struct {
+	// RestoreSize specifies the data size for the volume to be restored
+	RestoreSize int64
+
+	// CacheVolume specifies the info for cache volumes
+	CacheVolume *repocache.CacheConfigs
+}
+
+type CacheConfigs struct {
+	// StorageClass specifies the storage class for cache volumes
+	StorageClass string
+
+	// Limit specifies the maximum size of the cache data
+	Limit int64
+
+	// ResidentThreshold specifies the minimum size of the cache data to create a cache volume
+	ResidentThreshold int64
+}
+```
+
+### Data Mover Pods
+
+Data mover pods retrieve the cache volume directory from `cache-volume-path` parameter and pass it to Unified Repository.  
+If the directory is empty, Unified Repository uses the resident location for data cache, that is, the root file system.  
+
+### Kopia Repository
+
+Kopia repository supports cache directory configuration for both metadata and data. The existing `SetupConnectOptions` is modified to customize the `CacheDirectory`:  
+
+```go
+func SetupConnectOptions(ctx context.Context, repoOptions udmrepo.RepoOptions) repo.ConnectOptions {
+    ...
+
+	return repo.ConnectOptions{
+		CachingOptions: content.CachingOptions{
+			CacheDirectory: cacheDir,
+			...
+		},
+		...
+	}
+}
+```  
+
+
+[1]: Implemented/unified-repo-and-kopia-integration/unified-repo-and-kopia-integration.md
+[2]: Implemented/vgdp-micro-service/vgdp-micro-service.md
+[3]: Implemented/vgdp-micro-service-for-fs-backup/vgdp-micro-service-for-fs-backup.md
+[4]: Implemented/repo_maintenance_job_config.md
+[5]: Implemented/backup-repo-config.md
--- a/design/bsl-certificate-support_design.md
+++ b/design/bsl-certificate-support_design.md
@@ -0,0 +1,417 @@
+# Design for BSL Certificate Support Enhancement
+
+## Abstract
+
+This design document describes the enhancement of BackupStorageLocation (BSL) certificate management in Velero, introducing a Secret-based certificate reference mechanism (`caCertRef`) alongside the existing inline certificate field (`caCert`). This enhancement provides a more secure, Kubernetes-native approach to certificate management while enabling future CLI improvements for automatic certificate discovery.
+
+## Background
+
+Currently, Velero supports TLS certificate verification for object storage providers through an inline `caCert` field in the BSL specification. While functional, this approach has several limitations:
+
+- **Security**: Certificates are stored directly in the BSL YAML, potentially exposing sensitive data
+- **Management**: Certificate rotation requires updating the BSL resource itself
+- **CLI Usability**: Users must manually specify certificates when using CLI commands
+- **Size Limitations**: Large certificate bundles can make BSL resources unwieldy
+
+Issue #9097 and PR #8557 highlight the need for improved certificate management that addresses these concerns while maintaining backward compatibility.
+
+## Goals
+
+- Provide a secure, Secret-based certificate storage mechanism
+- Maintain full backward compatibility with existing BSL configurations
+- Enable future CLI enhancements for automatic certificate discovery
+- Simplify certificate rotation and management
+- Provide clear migration path for existing users
+
+## Non-Goals
+
+- Removing support for inline certificates immediately
+- Changing the behavior of existing BSL configurations
+- Implementing client-side certificate validation
+- Supporting certificates from ConfigMaps or other resource types
+
+## High-Level Design
+
+### API Changes
+
+#### New Field: CACertRef
+
+```go
+type ObjectStorageLocation struct {
+    // Existing field (now deprecated)
+    // +optional
+    // +kubebuilder:deprecatedversion:warning="caCert is deprecated, use caCertRef instead"
+    CACert []byte `json:"caCert,omitempty"`
+
+    // New field for Secret reference
+    // +optional
+    CACertRef *corev1api.SecretKeySelector `json:"caCertRef,omitempty"`
+}
+```
+
+The `SecretKeySelector` follows standard Kubernetes patterns:
+```go
+type SecretKeySelector struct {
+    // Name of the Secret
+    Name string `json:"name"`
+    // Key within the Secret
+    Key string `json:"key"`
+}
+```
+
+### Certificate Resolution Logic
+
+The system follows a priority-based resolution:
+
+1. If `caCertRef` is specified, retrieve certificate from the referenced Secret
+2. If `caCert` is specified (and `caCertRef` is not), use the inline certificate
+3. If neither is specified, no custom CA certificate is used
+
+### Validation
+
+BSL validation ensures mutual exclusivity:
+```go
+func (bsl *BackupStorageLocation) Validate() error {
+    if bsl.Spec.ObjectStorage != nil &&
+        bsl.Spec.ObjectStorage.CACert != nil &&
+        bsl.Spec.ObjectStorage.CACertRef != nil {
+        return errors.New("cannot specify both caCert and caCertRef in objectStorage")
+    }
+    return nil
+}
+```
+
+## Detailed Design
+
+### BSL Controller Changes
+
+The BSL controller incorporates validation during reconciliation:
+
+```go
+func (r *backupStorageLocationReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
+    // ... existing code ...
+    
+    // Validate BSL configuration
+    if err := location.Validate(); err != nil {
+        r.logger.WithError(err).Error("BSL validation failed")
+        return ctrl.Result{}, err
+    }
+    
+    // ... continue reconciliation ...
+}
+```
+
+### Repository Provider Integration
+
+All repository providers implement consistent certificate handling:
+
+```go
+func configureCACert(bsl *velerov1api.BackupStorageLocation, credGetter *credentials.CredentialGetter) ([]byte, error) {
+    if bsl.Spec.ObjectStorage == nil {
+        return nil, nil
+    }
+
+    // Prefer caCertRef (new method)
+    if bsl.Spec.ObjectStorage.CACertRef != nil {
+        certString, err := credGetter.FromSecret.Get(bsl.Spec.ObjectStorage.CACertRef)
+        if err != nil {
+            return nil, errors.Wrap(err, "error getting CA certificate from secret")
+        }
+        return []byte(certString), nil
+    }
+
+    // Fall back to caCert (deprecated)
+    if bsl.Spec.ObjectStorage.CACert != nil {
+        return bsl.Spec.ObjectStorage.CACert, nil
+    }
+
+    return nil, nil
+}
+```
+
+### CLI Certificate Discovery Integration
+
+#### Background: PR #8557 Implementation
+PR #8557 ("CLI automatically discovers and uses cacert from BSL") was merged in August 2025, introducing automatic CA certificate discovery from BackupStorageLocation for Velero CLI download operations. This eliminated the need for users to manually specify the `--cacert` flag when performing operations like `backup describe`, `backup download`, `backup logs`, and `restore logs`.
+
+#### Current Implementation (Post PR #8557)
+The CLI now automatically discovers certificates from BSL through the `pkg/cmd/util/cacert/bsl_cacert.go` module:
+
+```go
+// Current implementation only supports inline caCert
+func GetCACertFromBSL(ctx context.Context, client kbclient.Client, namespace, bslName string) (string, error) {
+    // ... fetch BSL ...
+    if bsl.Spec.ObjectStorage != nil && len(bsl.Spec.ObjectStorage.CACert) > 0 {
+        return string(bsl.Spec.ObjectStorage.CACert), nil
+    }
+    return "", nil
+}
+```
+
+#### Enhancement with caCertRef Support
+This design extends the existing CLI certificate discovery to support the new `caCertRef` field:
+
+```go
+// Enhanced implementation supporting both caCert and caCertRef
+func GetCACertFromBSL(ctx context.Context, client kbclient.Client, namespace, bslName string) (string, error) {
+    // ... fetch BSL ...
+
+    // Prefer caCertRef over inline caCert
+    if bsl.Spec.ObjectStorage.CACertRef != nil {
+        secret := &corev1api.Secret{}
+        key := types.NamespacedName{
+            Name:      bsl.Spec.ObjectStorage.CACertRef.Name,
+            Namespace: namespace,
+        }
+        if err := client.Get(ctx, key, secret); err != nil {
+            return "", errors.Wrap(err, "error getting certificate secret")
+        }
+
+        certData, ok := secret.Data[bsl.Spec.ObjectStorage.CACertRef.Key]
+        if !ok {
+            return "", errors.Errorf("key %s not found in secret",
+                bsl.Spec.ObjectStorage.CACertRef.Key)
+        }
+        return string(certData), nil
+    }
+
+    // Fall back to inline caCert (deprecated)
+    if bsl.Spec.ObjectStorage.CACert != nil {
+        return string(bsl.Spec.ObjectStorage.CACert), nil
+    }
+
+    return "", nil
+}
+```
+
+#### Certificate Resolution Priority
+
+The CLI follows this priority order for certificate resolution:
+
+1. **`--cacert` flag** - Manual override, highest priority
+2. **`caCertRef`** - Secret-based certificate (recommended)
+3. **`caCert`** - Inline certificate (deprecated)
+4. **System certificate pool** - Default fallback
+
+#### User Experience Improvements
+
+With both PR #8557 and this enhancement:
+
+```bash
+# Automatic discovery - works with both caCert and caCertRef
+velero backup describe my-backup
+velero backup download my-backup
+velero backup logs my-backup
+velero restore logs my-restore
+
+# Manual override still available
+velero backup describe my-backup --cacert /custom/ca.crt
+
+# Debug output shows certificate source
+velero backup download my-backup --log-level=debug
+# [DEBUG] Resolved CA certificate from BSL 'default' Secret 'storage-ca-cert' key 'ca-bundle.crt'
+```
+
+#### RBAC Considerations for CLI
+
+CLI users need read access to Secrets when using `caCertRef`:
+
+```yaml
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: velero-cli-user
+  namespace: velero
+rules:
+- apiGroups: ["velero.io"]
+  resources: ["backups", "restores", "backupstoragelocations"]
+  verbs: ["get", "list"]
+- apiGroups: [""]
+  resources: ["secrets"]
+  verbs: ["get"]
+  # Limited to secrets referenced by BSLs
+```
+
+### Migration Strategy
+
+#### Phase 1: Introduction (Current)
+- Add `caCertRef` field
+- Mark `caCert` as deprecated
+- Both fields supported, mutual exclusivity enforced
+
+#### Phase 2: Migration Period
+- Documentation and tools to help users migrate
+- Warning messages for `caCert` usage
+- CLI enhancements to leverage `caCertRef`
+
+#### Phase 3: Future Removal
+- Remove `caCert` field in major version update
+- Provide migration tool for automatic conversion
+
+## User Experience
+
+### Creating a BSL with Certificate Reference
+
+1. Create a Secret containing the CA certificate:
+```yaml
+apiVersion: v1
+kind: Secret
+metadata:
+  name: storage-ca-cert
+  namespace: velero
+type: Opaque
+data:
+  ca-bundle.crt: <base64-encoded-certificate>
+```
+
+2. Reference the Secret in BSL:
+```yaml
+apiVersion: velero.io/v1
+kind: BackupStorageLocation
+metadata:
+  name: default
+  namespace: velero
+spec:
+  provider: aws
+  objectStorage:
+    bucket: my-bucket
+    caCertRef:
+      name: storage-ca-cert
+      key: ca-bundle.crt
+```
+
+### Certificate Rotation
+
+With Secret-based certificates:
+```bash
+# Update the Secret with new certificate
+kubectl create secret generic storage-ca-cert \
+  --from-file=ca-bundle.crt=new-ca.crt \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+# No BSL update required - changes take effect on next use
+```
+
+### CLI Usage Examples
+
+#### Immediate Benefits
+- No change required for existing workflows
+- Certificate validation errors include helpful context
+
+#### Future CLI Enhancements
+```bash
+# Automatic certificate discovery
+velero backup download my-backup
+
+# Manual override still available
+velero backup download my-backup --cacert /custom/ca.crt
+
+# Debug certificate resolution
+velero backup download my-backup --log-level=debug
+# [DEBUG] Resolved CA certificate from BSL 'default' Secret 'storage-ca-cert'
+```
+
+## Security Considerations
+
+### Advantages of Secret-based Storage
+
+1. **Encryption at Rest**: Secrets can be encrypted in etcd when encryption at rest is configured for the cluster
+2. **RBAC Control**: Fine-grained access control via Kubernetes RBAC
+3. **Audit Trail**: Secret access is auditable
+4. **Separation of Concerns**: Certificates separate from configuration
+
+### Required Permissions
+
+The Velero server requires additional RBAC permissions:
+```yaml
+- apiGroups: [""]
+  resources: ["secrets"]
+  verbs: ["get"]
+  # Scoped to secrets referenced by BSLs
+```
+
+## Compatibility
+
+### Backward Compatibility
+
+- Existing BSLs with `caCert` continue to function unchanged
+- No breaking changes to API
+- Gradual migration path
+
+### Forward Compatibility
+
+- Design allows for future enhancements:
+  - Multiple certificate support
+  - Certificate chain validation
+  - Automatic certificate discovery from cloud providers
+
+## Implementation Phases
+
+### Phase 1: Core Implementation ✓ (Current PR)
+- API changes with new `caCertRef` field
+- Controller validation
+- Repository provider updates
+- Basic testing
+
+### Phase 2: CLI Enhancement (Future)
+- Automatic certificate discovery in CLI
+- Enhanced error messages
+- Debug logging for certificate resolution
+
+### Phase 3: Migration Tools (Future)
+- Automated migration scripts
+- Validation tools
+- Documentation updates
+
+## Testing
+
+### Unit Tests
+- BSL validation logic
+- Certificate resolution in providers
+- Controller behavior
+
+### Integration Tests
+- End-to-end backup/restore with `caCertRef`
+- Certificate rotation scenarios
+- Migration from `caCert` to `caCertRef`
+
+### Manual Testing Scenarios
+1. Create BSL with `caCertRef`
+2. Perform backup/restore operations
+3. Rotate certificate in Secret
+4. Verify continued operation
+
+## Documentation
+
+### User Documentation
+- Migration guide from `caCert` to `caCertRef`
+- Examples for common cloud providers
+- Troubleshooting guide
+
+### API Documentation
+- Updated API reference
+- Deprecation notices
+- Field descriptions
+
+## Alternatives Considered
+
+### ConfigMap-based Storage
+- Pros: Similar to Secrets, simpler API
+- Cons: Not designed for sensitive data, no encryption at rest
+- Decision: Secrets are the Kubernetes standard for sensitive data
+
+### External Certificate Management
+- Pros: Integration with cert-manager, etc.
+- Cons: Additional complexity, dependencies
+- Decision: Keep it simple, allow users to manage certificates as needed
+
+### Immediate Removal of Inline Certificates
+- Pros: Cleaner API, forces best practices
+- Cons: Breaking change, migration burden
+- Decision: Gradual deprecation respects existing users
+
+## Conclusion
+
+This design provides a secure, Kubernetes-native approach to certificate management in Velero while maintaining backward compatibility. It establishes the foundation for enhanced CLI functionality and improved user experience, addressing the concerns raised in issue #9097 and enabling the features proposed in PR #8557.
+
+The phased approach ensures smooth migration for existing users while delivering immediate security benefits for new deployments.
--- a/design/concurrent-backup-processing.md
+++ b/design/concurrent-backup-processing.md
@@ -0,0 +1,257 @@
+# Concurrent Backup Processing
+
+This enhancement will enable Velero to process multiple backups at the same time. This is largely a usability enhancement rather than a performance enhancement, since the overall backup throughput may not be significantly improved over the current implementation, since we are already processing individual backup items in parallel. It is a significant usability improvement, though, as with the current design, a user who submits a small backup may have to wait significantly longer than expected if the backup is submitted immediately after a large backup.
+
+## Background
+
+With the current implementation, only one backup may be `InProgress` at a time. A second backup created will not start processing until the first backup moves on to `WaitingForPluginOperations` or `Finalizing`. This is a usability concern, especially in clusters where multiple users are initiating backups. With this enhancement, we intend to allow multiple backups to be processed concurrently. This will allow backups to start processing immediately, even if a large backup was just submitted by another user. This enhancement will build on top of the prior parallel item processing feature by creating a dedicated ItemBlock worker pool for each running backup. The pool will be created at the beginning of the backup reconcile, and the input channel will be passed to the Kubernetes backupper just like it is in the current release.
+
+The primary challenge is to make sure that the same workload in multiple backups is not backed up concurrently. If that were to happen, we would risk data corruption, especially around the processing of pod hooks and volume backup. For this first release we will take a conservative, high-level approach to overlap detection. Two backups will not run concurrently if there is any overlap in included namespaces. For example, if a backup that includes `ns1` and `ns2` is running, then a second backup for `ns2` and `ns3` will not be started. If a backup which does not filter namespaces is running (either a whole cluster backup or a non-namespace-limited backup with a label selector) then no other backups will be started, since a backup across all namespaces overlaps with any other backup. Calculating item-level overlap for queued backups is problematic since we don't know which items are included in a backup until backup processing has begun. A future release may add ItemBlock overlap detection, where at the item block worker level, the same item will not be processed by two different workers at the same time. This works together with workload conflict detection to further detect conflicts at a more granular level for shared resources between backups. Eventually, with a more complete understanding of individual workloads (either via ItemBlocks or some higher level model), the namespace level overlap detection may be relaxed in future versions.
+
+## Goals
+- Process multiple backups concurrently
+- Detect namespace overlap to avoid conflicts
+- For queued backups (not yet runnable due to concurrency limits or overlap), indicate the queue position in status
+
+## Non Goals
+- Handling NFS PVs when more than one PV point to the same underlying NFS share
+- Handling VGDP cancellation for failed backups on restart
+- Mounting a PVC for scenarios in which /tmp is too small for the number of concurrent backups
+- Providing a mechanism to identify high priority backups which get preferential treatment in terms of ItemBlock worker availability
+- Item-level overlap detection (future feature)
+- Providing the ability to disable namespace-level overlap detection once Item-level overlap detection is in place (although this may be supported in a future version).
+
+## High-Level Design
+
+### Backup CRD changes
+
+Two new backup phases will be added: `Queued` and `ReadyToStart`. In the Backup workflow, new backups will be moved to the Queued phase when they are added to the backup queue. When a backup is removed from the queue because it is now able to run, it will be moved to the `ReadyToStart` phase, which will allow the backup controller to start processing it.
+
+In addition, a new Status field, `QueuePosition`, will be added to track the backup's current position in the queue.
+
+### New Controller: `backupQueueReconciler`
+
+A new reconciler will be added, `backupQueueReconciler` which will use the current `backupReconciler` logic for reconciling `New` backups but instead of running the backup, it will move the Backup to the `Queued` phase and set `QueuePosition`.
+
+In addition, this reconciler will periodically reconcile all queued backups (on some configurable time interval) and if there is a runnable backup, remove it from the queue, update `QueuePosition` for any queued backups behind it, and update its phase to `ReadyToStart`.
+
+Queued backups will be reconciled in order based on `QueuePosition`, so the first runnable backup found will be processed. A backup is runnable if both of the following conditions are true:
+1) The total number of backups either `InProgress` or `ReadyToStart` is less than the configured number of concurrent backups.
+2) The backup has no overlap with any backups currently `InProgress` or `ReadyToStart` or with any `Queued` backups with a higher (i.e. closer to 1) queue position than this backup.
+
+### Updates to Backup controller
+
+The current `backupReconciler` will change its reconciling rules. Instead of watching and reconciling New backups, it will reconcile `ReadyToStart` backups. In addition, it will be configured to run in parallel by setting `MaxConcurrentReconciles` based on the `concurrent-backups` server arg.
+
+The startup (and shutdown) of the ItemBlock worker pool will be moved from reconciler startup to the backup reconcile, which will give each running backup its own dedicated worker pool. The per-backup worker pool will use the existing `--item-block-worker-count` installer/server arg. This means that the maximum number of ItemBlock workers for the entire Velero pod will be the ItemBlock worker count multiplied by concurrentBackups. For example, if concurrentBackups is 5, and itemBlockWorkerCount is 6, then there will be, at most, 30 worker threads active, 6 dedicated to each InProgress backup, but this maximum will only be achieved when the maximum number of backups are InProgress. This also means that each InProgress backup will have a dedicated ItemBlock input channel with the same fixed buffer size.
+
+## Detailed Design
+
+### New Install/Server configuration args
+
+A new install/server arg, `concurrent-backups` will be added. This will be an int-valued field specifying the number of backups which may be processed concurrently (with phase `InProgress`). If not specified, the default value of 1 will be used.
+
+### Consideration of backup overlap and concurrent backup processing
+
+The primary consideration for running additional backups concurrently is the configured `concurrent-backups` parameter. If the total number of `InProgress` and `ReadyToStart` backups is equal to `concurrent-backups` then any `Queued` backups will remain in the queue.
+
+The second consideration is backup overlap. In order to prevent interaction between running backups (particularly around volume backup and pod hooks), we cannot allow two overlapping backups to run at the same time. For now, we will define overlap broadly -- requiring that two concurrent backups don't include any of the same namespaces. A backup for `ns1` can run concurrently with a backup for `ns2`, but a backup for `[ns1,ns2]` cannot run concurrently with a backup for `ns1`. One consequence of this approach is that a backup which includes all namespaces (even if further filtered by resource or label) cannot run concurrently with *any other backup*.
+
+When determining which queued backup to run next, Velero will look for the next queued backup which has no overlap with any InProgress or ReadyToStart backup or any Queued backup ahead of it. The reason we need to consider queued as well as running backups for overlap detection is as follows.
+
+Consider the following scenario. These are the current not-completed backups (ordered from oldest to newest)
+1. backup1, includedNamespaces: [ns1, ns2], phase: InProgress
+2. backup2, includedNamespaces: [ns2, ns3, ns5], phase: Queued, QueuePosition: 1
+3. backup3, includedNamespaces: [ns4, ns3], phase: Queued, QueuePosition: 2
+4. backup4, includedNamespaces: [ns5, ns6], phase: Queued, QueuePosition: 3
+5. backup5, includedNamespaces: [ns8, ns9], phase: Queued, QueuePosition: 4
+
+Assuming `concurrent-backups` is 2, on the next reconcile, Velero will be able to start a second backup if there is one with no overlap. `backup2` cannot run, since `ns2` overlaps between it and the running `backup1`. If we only considered running overlap (and not queued overlap), then `backup3` could run now. It conflicts with the queued `backup2` on `ns3` but it does not conflict with the running backup. However, if it runs now, then when `backup1` completes, `backup2` still can't run (since it now overlaps with running `backup3` on `ns3`), so `backup4` starts instead. Now when `backup3` completes, `backup2` still can't run (since it now conflicts with `backup4` on `ns5`). This means that even though it was the second backup created, it's the fourth to run -- providing worse time to completion than without parallel backups. If a queued backup has a large number of namespaces (a full-cluster backup for example), it would never run as long as new single-namespace backups keep being added to the queue.
+
+To resolve this problem we consider both running backups as well as backups ahead in the queue when resolving overlap conflicts. In the above scenario, `backup2` can't run yet since it overlaps with the running backup on `ns2`. In addition, `backup3` and `backup4` also can't run yet since they overlap with queued `backup2`. Therefore, `backup5` will run now. Once `backup1` completes, `backup2` will be free to run.
+
+### Backup CRD changes
+
+New Backup phases:
+```go
+const (
+	// BackupPhaseQueued means the backup has been added to the
+	// queue by the BackupQueueReconciler.
+	BackupPhaseQueued BackupPhase = "Queued"
+
+	// BackupPhaseReadyToStart means the backup has been removed from the
+	// queue by the BackupQueueReconciler and is ready to start.
+	BackupPhaseReadyToStart BackupPhase = "ReadyToStart"
+)
+```
+
+In addition, a new Status field, `queuePosition`, will be added to track the backup's current position in the queue.
+```go
+	// QueuePosition is the position held by the backup in the queue.
+	// QueuePosition=1 means this backup is the next to be considered.
+	// Only relevant when Phase is "Queued"
+	// +optional
+	QueuePosition int `json:"queuePosition,omitempty"`
+```
+
+### New Controller: `backupQueueReconciler`
+
+A new reconciler will be added, `backupQueueReconciler` which will reconcile backups under these conditions:
+1) Watching Create/Update for backups in `New` (or empty) phase
+2) Watching for Backup phase transition from `InProgress` to something else to reconcile all `Queued` backups
+3) Watching for Backup phase transition from `New` (or empty) to `Queued` to reconcile all `Queued` backups
+4) Periodic reconcile of `Queued` backups to handle backups queued at server startup as well as to make sure we never have a situation where backups are queued indefinitely because of a race condition or because they were otherwise missed in the reconcile on prior backup completion.
+
+The reconciler will be set up as follows -- note that New backups are reconciled on Create/Update, while Queued backups are reconciled when an InProgress backup moves on to another state or when a new backup moves to the Queued state. We also reconcile Queued backups periodically to handle the case of a Velero pod restart with Queued backups, as well as to handle possible edge cases where a queued backup doesn't get moved out of the queue at the point of backup completion or an error occurs during a prior Queued backup reconcile.
+
+```go
+func (c *backupOperationsReconciler) SetupWithManager(mgr ctrl.Manager) error {
+        // only consider Queued backups, order by QueuePosition
+	gp := kube.NewGenericEventPredicate(func(object client.Object) bool {
+		backup := object.(*velerov1api.Backup)
+		return (backup.Status.Phase == velerov1api.BackupPhaseQueued)
+	})
+	s := kube.NewPeriodicalEnqueueSource(c.logger.WithField("controller", constant.ControllerBackupOperations), mgr.GetClient(), &velerov1api.BackupList{}, c.frequency, kube.PeriodicalEnqueueSourceOption{
+		Predicates: []predicate.Predicate{gp},
+		OrderFunc: queuePositionOrderFunc,
+	})
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&velerov1api.Backup{}, builder.WithPredicates(predicate.Funcs{
+				UpdateFunc: func(ue event.UpdateEvent) bool {
+					backup := ue.ObjectNew.(*velerov1api.Backup)
+					return backup.Status.Phase == "" || backup.status.Phase == velerov1api.BackupPhaseNew
+				},
+				CreateFunc: func(event.CreateEvent) bool {
+					return backup.Status.Phase == "" || backup.status.Phase == velerov1api.BackupPhaseNew
+				},
+				DeleteFunc: func(de event.DeleteEvent) bool {
+					return false
+				},
+				GenericFunc: func(ge event.GenericEvent) bool {
+					return false
+				},
+		})).
+		Watch(
+			&source.Kind{Type: &velerov1api.Backup{}},
+			&handler.EnqueueRequestsFromMapFunc{
+				ToRequests: handler.ToRequestsFunc(func(a handler.MapObject) []reconcile.Request {
+					backupList := velerov1api.BackupList{}
+					if err := p.List(ctx, backupList); err != nil {
+						p.logger.WithError(err).Error("error listing backups")
+						return
+					}
+					requests = []reconcile.request{}
+					// filter backup list by Phase=queued
+					// sort backup list by queuePosition
+					return requests
+				}),
+		},
+		builder.WithPredicates(predicate.Funcs{
+				UpdateFunc: func(ue event.UpdateEvent) bool {
+					oldBackup := ue.ObjectOld.(*velerov1api.Backup)
+					newBackup := ue.ObjectNew.(*velerov1api.Backup)
+					return oldBackup.Status.Phase == velerov1api.BackupPhaseInProgress &&
+						newBackup.Status.Phase != velerov1api.BackupPhaseInProgress ||
+						oldBackup.Status.Phase != velerov1api.BackupPhaseQueued &&
+						newBackup.Status.Phase == velerov1api.BackupPhaseQueued
+				},
+				CreateFunc: func(event.CreateEvent) bool {
+					return false
+				},
+				DeleteFunc: func(de event.DeleteEvent) bool {
+					return false
+				},
+				GenericFunc: func(ge event.GenericEvent) bool {
+					return false
+				},
+		}).
+		WatchesRawSource(s).
+		Named(constant.ControllerBackupQueue).
+		Complete(c)
+}
+```
+
+New backups will be queued: Phase will be set to `Queued`, and `QueuePosition` will be set to an int value incremented from the highest current `QueuePosition` value among Queued backups.
+
+Queued backups will be removed from the queue if runnable:
+1) If the total number of backups either InProgress or ReadyToStart is greater than or equal to the concurrency limit, then exit without removing from the queue.
+2) If the current backup overlaps with any InProgress, ReadyToStart, or Queued backup with `QueuePosition < currentBackup.QueuePosition` then exit without removing from the queue.
+3) If we get here, the backup is runnable. To resolve a potential race condition where an InProgress backup completes between reconciling the backup with QueuePosition `n-1` and reconciling the current backup with QueuePosition `n`, we also check to see whether there are any runnable backups in the queue ahead of this one. The only time this will happen is if a backup completes immediately before reconcile starts which either frees up a concurrency slot or removes a namespace conflict. In this case, we don't want to run the current backup since the one ahead of this one in the queue (which was recently passed over before the InProgress backup completed) must run first. In this case, exit without removing from the queue.
+4) If we get here, remove the backup from the queue by setting Phase to `ReadyToStart` and `QueuePosition` to zero. Decrement the `QueuePosition` of any other Queued backups with a `QueuePosition` higher than the current backup's queue position prior to dequeuing. At this point, the backup reconciler will start the backup.
+
+`if len(inProgressBackups)+len(pendingStartBackups) >= concurrentBackups`
+
+```
+	switch original.Status.Phase {
+	case "", velerov1api.BackupPhaseNew:
+		// enqueue backup -- set phase=Queued, set queuePosition=maxCurrentQueuePosition+1
+	}
+	// We should only ever get these events when added in order by the periodical enqueue source
+	// so as long as the current backup has not conflicts ahead of it or running, we should be good to
+	// dequeue
+	case "", velerov1api.BackupPhaseQueued:
+		// list backups, filter on Queued, ReadyToStart, and InProgress
+		// if number of InProgress backups + number of ReadyToStart backups >= concurrency limit, exit
+		// generate list of all namespaces included in InProgress, ReadyToStart, and Queued backups with
+		// queuePosition < backup.Status.QueuePosition
+		// if overlap found, exit
+		// check backups ahead of this one in the queue for runnability. If any are runnable, exit
+		// dequeue backup: set Phase to ReadyToStart, QueuePosition to 0, and decrement QueuePosition
+		// for all QueuedBackups behind this one in the queue
+	}
+
+```
+
+The queue controller will run as a single reconciler thread, so we will not need to deal with concurrency issues when moving backups from New to Queued or from Queued to ReadyToStart, and all of the updates to QueuePosition will be from a single thread.
+
+### Updates to Backup controller
+
+The Reconcile logic will be updated to respond to ReadyToStart backups instead of New backups:
+
+```
+@@ -234,8 +234,8 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
+        // InProgress, we still need this check so we can return nil to indicate we've finished processing
+        // this key (even though it was a no-op).
+        switch original.Status.Phase {
+-       case "", velerov1api.BackupPhaseNew:
+-               // only process new backups
+       case velerov1api.BackupPhaseReadyToStart:
+               // only process ReadyToStart backups
+        default:
+                b.logger.WithFields(logrus.Fields{
+                        "backup": kubeutil.NamespaceAndName(original),
+```
+
+In addition, it will be configured to run in parallel by setting `MaxConcurrentReconciles` based on the `concurrent-backups` server arg.
+
+```
+@@ -149,6 +149,9 @@ func NewBackupReconciler(
+ func (b *backupReconciler) SetupWithManager(mgr ctrl.Manager) error {
+        return ctrl.NewControllerManagedBy(mgr).
+                For(&velerov1api.Backup{}).
+                WithOptions(controller.Options{
+                       MaxConcurrentReconciles: concurrentBackups,
+               }).
+                Named(constant.ControllerBackup).
+                Complete(b)
+ }
+```
+
+The controller-runtime core reconciler logic already prevents the same resource from being reconciled by two different reconciler threads, so we don't need to worry about concurrency issues at the controller level.
+
+The workerPool reference will be moved from the backupReconciler to the backupRequest, since this will now be backup-specific, and the initialization code for the worker pool will be moved from the reconciler init into the backup reconcile. This worker pool will be shut down upon exiting the Reconcile method.
+
+### Resilience to restart of velero pod
+
+The new backup phases (`Queued` and `ReadyToStart`) will be resilient to velero pod restarts. If the velero pod crashes or is restarted, only backups in the `InProgress` phase will be failed, so there is no change to current behavior. Queued backups will retain their queue position on restart, and ReadyToStart backups will move to InProgress when reconciled.
+
+### Observability
+
+#### Logging
+
+When a backup is dequeued, an info log message will also include the wait time, calculated as `now - creationTimestamp`. When a backup is passed over due to overlap, an info log message will indicate which namespaces were in conflict.
+
+#### Velero CLI
+
+The `velero backup describe` output will include the current queue position for queued backups.
--- a/design/wildcard-namespace-support-design.md
+++ b/design/wildcard-namespace-support-design.md
@@ -0,0 +1,115 @@
+
+# Wildcard Namespace Support
+
+## Abstract
+
+Velero currently treats namespace patterns with glob characters as literal strings. This design adds wildcard expansion to support flexible namespace selection using patterns like `app-*` or `test-{dev,staging}`.
+
+## Background
+
+Requested in [#1874](https://github.com/vmware-tanzu/velero/issues/1874) for more flexible namespace selection.
+
+## Goals
+
+- Support glob pattern expansion in namespace includes/excludes
+- Maintain backward compatibility with existing `*` behavior
+
+## Non-Goals
+
+- Complex regex patterns beyond basic globs
+
+## High-Level Design
+
+Wildcard expansion occurs early in both backup and restore flows, converting patterns to literal namespace lists before normal processing.
+
+### Backup Flow
+
+Expansion happens in `getResourceItems()` before namespace collection:
+1. Check if wildcards exist using `ShouldExpandWildcards()`
+2. Expand patterns against active cluster namespaces
+3. Replace includes/excludes with expanded literal namespaces
+4. Continue with normal backup processing
+
+### Restore Flow
+
+Expansion occurs in `execute()` after parsing backup contents:
+1. Extract available namespaces from backup tar
+2. Expand patterns against backup namespaces (not cluster namespaces)
+3. Update restore context with expanded namespaces
+4. Continue with normal restore processing
+
+This ensures restore wildcards match actual backup contents, not current cluster state.
+
+## Detailed Design
+
+### Status Fields
+
+Add wildcard expansion tracking to backup and restore CRDs:
+
+```go
+type WildcardNamespaceStatus struct {
+    // IncludeWildcardMatches records namespaces that matched include patterns
+    // +optional
+    IncludeWildcardMatches []string `json:"includeWildcardMatches,omitempty"`
+    
+    // ExcludeWildcardMatches records namespaces that matched exclude patterns  
+    // +optional
+    ExcludeWildcardMatches []string `json:"excludeWildcardMatches,omitempty"`
+    
+    // WildcardResult records final namespaces after wildcard processing
+    // +optional
+    WildcardResult []string `json:"wildcardResult,omitempty"`
+}
+
+// Added to both BackupStatus and RestoreStatus
+type BackupStatus struct {
+    // WildcardNamespaces contains wildcard expansion results
+    // +optional
+    WildcardNamespaces *WildcardNamespaceStatus `json:"wildcardNamespaces,omitempty"`
+}
+```
+
+### Wildcard Expansion Package
+
+New `pkg/util/wildcard/expand.go` package provides:
+
+- `ShouldExpandWildcards()` - Skip expansion for simple "*" case
+- `ExpandWildcards()` - Main expansion function using `github.com/gobwas/glob`
+- Pattern validation rejecting unsupported regex symbols
+
+**Supported patterns**: `*`, `?`, `[abc]`, `{a,b,c}`  
+**Unsupported**: `|()`, `**`
+
+### Implementation Details
+
+#### Backup Integration (`pkg/backup/item_collector.go`)
+
+Expansion in `getResourceItems()`:
+- Call `wildcard.ExpandWildcards()` with cluster namespaces
+- Update `NamespaceIncludesExcludes` with expanded results
+- Populate status fields with expansion results
+
+#### Restore Integration (`pkg/restore/restore.go`)
+
+Expansion in `execute()`:
+```go
+if wildcard.ShouldExpandWildcards(includes, excludes) {
+    availableNamespaces := extractNamespacesFromBackup(backupResources)
+    expandedIncludes, expandedExcludes, err := wildcard.ExpandWildcards(
+        availableNamespaces, includes, excludes)
+    // Update context and status
+}
+```
+
+## Alternatives Considered
+
+1. **Client-side expansion**: Rejected because it wouldn't work for scheduled backups
+2. **Expansion in `collectNamespaces`**: Rejected because these functions expect literal namespaces
+
+## Compatibility
+
+Maintains full backward compatibility - existing "*" behavior unchanged.
+
+## Implementation
+
+Target: Velero 1.18
--- a/examples/minio/00-minio-deployment.yaml
+++ b/examples/minio/00-minio-deployment.yaml
@@ -107,7 +107,7 @@ spec:
        command:
        - /bin/sh
        - -c
-        - "mc --config-dir=/config config host add velero http://minio:9000 minio minio123 && mc --config-dir=/config mb -p velero/velero"
+        - "mc --config-dir=/config alias set velero http://minio:9000 minio minio123 && mc --config-dir=/config mb -p velero/velero"
        volumeMounts:
        - name: config
          mountPath: "/config"
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				Add `--apply` flag to `install` command, allowing usage of Kubernetes apply to make changes to existing installs
				`@@ -0,0 +1 @@`
				`feat: Enhance BackupStorageLocation with Secret-based CA certificate support`
				`@@ -0,0 +1 @@`
				`Fix issue #7725, add design for backup repo cache configuration`
				`@@ -0,0 +1 @@`
				`Add VolumePolicy support for PVC Phase conditions to allow skipping Pending PVCs`
				`@@ -0,0 +1 @@`
				`feat: Permit specifying annotations for the BackupPVC`
				`@@ -0,0 +1 @@`
				`Remove labels associated with previous backups`
				`@@ -0,0 +1 @@`
				`Fix issue #9229, don't attach backupPVC to the source node`
				`@@ -0,0 +1 @@`
				`Protect VolumeSnapshot field from race condition during multi-thread backup`
				`@@ -0,0 +1 @@`
				`Fix repository maintenance jobs to inherit allowlisted tolerations from Velero deployment`
				`@@ -1 +0,0 @@`
				`Backport to 1.16 (PR#9244 Update AzureAD Microsoft Authentication Library to v1.5.0)`