diff --git a/drivers/volume/portworx/portworx.go b/drivers/volume/portworx/portworx.go index 4dd4127256..7d0178b0da 100644 --- a/drivers/volume/portworx/portworx.go +++ b/drivers/volume/portworx/portworx.go @@ -3343,14 +3343,20 @@ func (p *portworx) StartBackup(backup *storkapi.ApplicationBackup, } return true, nil }) - if err != nil || cloudBackupCreateErr != nil { if isCloudBackupServerBusyError(cloudBackupCreateErr) { - return volumeInfos, &storkvolume.ErrStorageProviderBusy{Reason: cloudBackupCreateErr.Error()} + volumeInfo.Status = storkapi.ApplicationBackupStatusFailed + volumeInfo.Reason = cloudBackupCreateErr.Error() + volumeInfos = append(volumeInfos, volumeInfo) + continue } if _, ok := cloudBackupCreateErr.(*ost_errors.ErrExists); !ok { - return nil, fmt.Errorf("failed to start backup for %v (%v/%v): %v", + volumeInfo.Status = storkapi.ApplicationBackupStatusFailed + volumeInfo.Reason = fmt.Sprintf("%v", cloudBackupCreateErr) + volumeInfos = append(volumeInfos, volumeInfo) + logrus.Infof("failed to start backup for %v (%v/%v): %v", volume, pvc.Namespace, pvc.Name, cloudBackupCreateErr) + continue } } else if err == nil { // Only add volumeInfos if this was a successful backup @@ -3371,31 +3377,53 @@ func (p *portworx) GetBackupStatus(backup *storkapi.ApplicationBackup) ([]*stork volumeInfos := make([]*storkapi.ApplicationBackupVolumeInfo, 0) for _, vInfo := range backup.Status.Volumes { if vInfo.DriverName != storkvolume.PortworxDriverName { + volumeInfos = append(volumeInfos, vInfo) + continue + } + // Skip for volumes which are in failed state as there is no need to proceed + // further and we have to return the original volInfo back to caller + if vInfo.Status == storkapi.ApplicationBackupStatusFailed { + volumeInfos = append(volumeInfos, vInfo) continue } token, err := p.getUserToken(vInfo.Options, vInfo.Namespace) if err != nil { - return nil, fmt.Errorf("failed to fetch portworx user token: %v", err) + logrus.Errorf("failed to fetch portworx user 
token: %v", err) + vInfo.Reason = fmt.Sprintf("failed to fetch portworx user token: %v", err) + vInfo.Status = storkapi.ApplicationBackupStatusFailed + volumeInfos = append(volumeInfos, vInfo) + continue } volDriver, ok := driverMap[token] if !ok { volDriver, _, err = p.getUserVolDriverFromToken(token) if err != nil { - return nil, err + vInfo.Status = storkapi.ApplicationBackupStatusFailed + vInfo.Reason = fmt.Sprintf("%v", err) + logrus.Errorf("%v", err) + volumeInfos = append(volumeInfos, vInfo) + continue } driverMap[token] = volDriver } cloudBackupClient, err := p.getCloudBackupClient() if err != nil { - return nil, err + vInfo.Status = storkapi.ApplicationBackupStatusFailed + vInfo.Reason = fmt.Sprintf("%v", err) + volumeInfos = append(volumeInfos, vInfo) + logrus.Errorf("%v", err) } ctx, cancel := context.WithTimeout(context.Background(), cloudBackupTimeout) defer cancel() if len(token) > 0 { ctx, err = p.addTokenToContext(ctx, token) if err != nil { - return nil, err + vInfo.Status = storkapi.ApplicationBackupStatusFailed + vInfo.Reason = fmt.Sprintf("%v", err) + volumeInfos = append(volumeInfos, vInfo) + logrus.Errorf("%v", err) + } } diff --git a/pkg/apis/stork/v1alpha1/applicationbackup.go b/pkg/apis/stork/v1alpha1/applicationbackup.go index a4cbd7c330..6c9f405dc9 100644 --- a/pkg/apis/stork/v1alpha1/applicationbackup.go +++ b/pkg/apis/stork/v1alpha1/applicationbackup.go @@ -71,6 +71,7 @@ type ApplicationBackupStatus struct { TotalSize uint64 `json:"totalSize"` ResourceCount int `json:"resourceCount"` LargeResourceEnabled bool `json:"largeResourceEnabled"` + FailedVolCount int `json:"failedVolCount"` } // ObjectInfo contains info about an object being backed up or restored @@ -83,6 +84,8 @@ type ObjectInfo struct { // ApplicationBackupResourceInfo is the info for the backup of a resource type ApplicationBackupResourceInfo struct { ObjectInfo `json:",inline"` + Status ApplicationBackupStatusType `json:"status"` + Reason string `json:"reason"` } // 
ApplicationBackupVolumeInfo is the info for the backup of a volume @@ -120,6 +123,8 @@ const ( ApplicationBackupStatusPartialSuccess ApplicationBackupStatusType = "PartialSuccess" // ApplicationBackupStatusSuccessful for when backup has completed successfully ApplicationBackupStatusSuccessful ApplicationBackupStatusType = "Successful" + // ApplicationBackupStatusSkip for when backup has been skipped + ApplicationBackupStatusSkip ApplicationBackupStatusType = "Skipped" ) // ApplicationBackupStageType is the stage of the backup diff --git a/pkg/applicationmanager/controllers/applicationbackup.go b/pkg/applicationmanager/controllers/applicationbackup.go index 93aa26a233..b235d9fb78 100644 --- a/pkg/applicationmanager/controllers/applicationbackup.go +++ b/pkg/applicationmanager/controllers/applicationbackup.go @@ -574,6 +574,8 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio backup.Status.Stage = stork_api.ApplicationBackupStageVolumes namespacedName := types.NamespacedName{} + partialFailed := false + partialSuccess := false if IsVolsToBeBackedUp(backup) { isResourceTypePVC := IsResourceTypePVC(backup) objectMap := stork_api.CreateObjectsMap(backup.Spec.IncludeResources) @@ -597,6 +599,9 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio } driverType := kdmpData.Data[genericBackupKey] logrus.Tracef("driverType: %v", driverType) + if backup.Status.Volumes == nil { + backup.Status.Volumes = make([]*stork_api.ApplicationBackupVolumeInfo, 0) + } for _, pvc := range pvcList.Items { // If a list of resources was specified during backup check if // this PVC was included @@ -653,9 +658,6 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio } } } - if backup.Status.Volumes == nil { - backup.Status.Volumes = make([]*stork_api.ApplicationBackupVolumeInfo, 0) - } namespacedName.Namespace = backup.Namespace namespacedName.Name = backup.Name @@ -673,13 +675,14 @@ func (a 
*ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio batchCount = defaultBackupVolumeBatchCount } } + // Will focus on only important errors like startbackup() failure which is responsible + // for creating a vol backup. If this fails, then we will move onto next vol and no retries. + // Again trying next time there is no guarantee that vol backup will pass. + // For transit errors before startBackup() we will return from the reconciler to be tried again for i := 0; i < len(pvcs); i += batchCount { batch := pvcs[i:min(i+batchCount, len(pvcs))] volumeInfos, err := driver.StartBackup(backup, batch) if err != nil { - // TODO: If starting backup for a drive fails mark the entire backup - // as Cancelling, cancel any other started backups and then mark - // it as failed if _, ok := err.(*volume.ErrStorageProviderBusy); ok { inProgressMsg := fmt.Sprintf("Volume backups are in progress. Backups are failing for some volumes"+ " since the storage provider is busy: %v. Backup will be retried", err) @@ -698,22 +701,25 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio if updateErr != nil { log.ApplicationBackupLog(backup).Errorf("failed to update backup object: %v", updateErr) } - return err + continue } message := fmt.Sprintf("Error starting ApplicationBackup for volumes: %v", err) log.ApplicationBackupLog(backup).Errorf(message) a.recorder.Event(backup, v1.EventTypeWarning, - string(stork_api.ApplicationBackupStatusFailed), + string(stork_api.ApplicationBackupStatusInProgress), message) _, err = a.updateBackupCRInVolumeStage( namespacedName, - stork_api.ApplicationBackupStatusFailed, - stork_api.ApplicationBackupStageFinal, - message, - nil, + stork_api.ApplicationBackupStatusInProgress, + backup.Status.Stage, + "Volume backups are in progress", + volumeInfos, ) - return err + if err != nil { + logrus.Errorf("%v", err) + } + continue } backup, err = a.updateBackupCRInVolumeStage( namespacedName, @@ -723,7 +729,7 @@ func 
(a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio volumeInfos, ) if err != nil { - return err + continue } } } @@ -731,6 +737,7 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio // In case Portworx if the snapshot ID is populated for every volume then the snapshot // process is considered to be completed successfully. // This ensures we don't execute the post-exec before all volume's snapshot is completed + volumeInfosAll := make([]*stork_api.ApplicationBackupVolumeInfo, 0) for driverName := range pvcMappings { var driver volume.Driver driver, err = volume.Get(driverName) @@ -739,8 +746,10 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio } if driverName == volume.PortworxDriverName { volumeInfos, err := driver.GetBackupStatus(backup) + volumeInfosAll = append(volumeInfosAll, volumeInfos...) if err != nil { - return fmt.Errorf("error getting backup status: %v", err) + logrus.Errorf("error getting backup status: %v", err) + continue } for _, volInfo := range volumeInfos { if volInfo.BackupID == "" { @@ -752,8 +761,13 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio return nil } } + } } + backup.Status.Volumes = volumeInfosAll + // NOTE: After this point don't initialize "backup.Status.Volumes" as this contains + // the first iterated list of PVC's which failed to be backed up and we are in + // Partial success state. 
For all purpose for volInfo always use backup.Status.Volumes // Run any post exec rules once backup is triggered driverCombo := a.checkVolumeDriverCombination(backup.Status.Volumes) @@ -797,23 +811,30 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio drivers := a.getDriversForBackup(backup) volumeInfos := make([]*stork_api.ApplicationBackupVolumeInfo, 0) for driverName := range drivers { - driver, err := volume.Get(driverName) if err != nil { return err } - status, err := driver.GetBackupStatus(backup) if err != nil { - return fmt.Errorf("error getting backup status for driver %v: %v", driverName, err) + // This will have status of failed volinfo whose status could not be got + // We need them also to be added to the list + logrus.Errorf("error getting backup status for driver %v: %v", driverName, err) + volumeInfos = append(volumeInfos, status...) + continue } + volumeInfos = append(volumeInfos, status...) } - backup.Status.Volumes = volumeInfos + backup.Status.Volumes = volumeInfos + // As part of partial success volumeInfos is already available, just update the same to backup CR + err = a.client.Update(context.TODO(), backup) + if err != nil { + return err + } // Now check if there is any failure or success - // TODO: On failure of one volume cancel other backups? 
- for _, vInfo := range volumeInfos { + for _, vInfo := range backup.Status.Volumes { if vInfo.Status == stork_api.ApplicationBackupStatusInProgress || vInfo.Status == stork_api.ApplicationBackupStatusInitial || vInfo.Status == stork_api.ApplicationBackupStatusPending { log.ApplicationBackupLog(backup).Infof("Volume backup still in progress: %v", vInfo.Volume) @@ -823,16 +844,15 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio v1.EventTypeWarning, string(vInfo.Status), fmt.Sprintf("Error backing up volume %v: %v", vInfo.Volume, vInfo.Reason)) - backup.Status.Stage = stork_api.ApplicationBackupStageFinal backup.Status.FinishTimestamp = metav1.Now() - backup.Status.Status = stork_api.ApplicationBackupStatusFailed - backup.Status.Reason = vInfo.Reason - break + partialFailed = true + // break } else if vInfo.Status == stork_api.ApplicationBackupStatusSuccessful { a.recorder.Event(backup, v1.EventTypeNormal, string(vInfo.Status), fmt.Sprintf("Volume %v backed up successfully", vInfo.Volume)) + partialSuccess = true } } } @@ -841,7 +861,7 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio if inProgress { // temporarily store the volume status, So that it will be used during retry. 
volumeInfos := backup.Status.Volumes - backup.Status.LastUpdateTimestamp = metav1.Now() + backup.Status.LastUpdateTimestamp = metav1.Now() //TODO: Need to have discussion on this for the current 30mins timeout we have // Store the new status err = a.client.Update(context.TODO(), backup) if err != nil { @@ -874,7 +894,7 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio // Run any post exec rules once backup is triggered driverCombo := a.checkVolumeDriverCombination(backup.Status.Volumes) - // If the driver combination of volumes onlykdmp or mixed of both kdmp and non-kdmp, call post exec rule + // If the driver combination of volumes only kdmp or mixed of both kdmp and non-kdmp, call post exec rule // backup of volume is success. if driverCombo == kdmpDriverOnly || driverCombo == mixedDriver { // Let's kill the pre-exec rule pod here so that application specific @@ -886,7 +906,6 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio logrus.Infof("Sending termination commands to kill pre-exec pod in kdmp or mixed driver path") channel <- true } - //terminationChannels = nil if backup.Spec.PostExecRule != "" { err = a.runPostExecRule(backup) if err != nil { @@ -910,6 +929,21 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio } } } + + if !partialSuccess && partialFailed { + // This case signifies that none of the volumes are successfully backed up + // hence marking it as failed + backup.Status.Stage = stork_api.ApplicationBackupStageFinal + backup.Status.FinishTimestamp = metav1.Now() + backup.Status.Status = stork_api.ApplicationBackupStatusFailed + backup.Status.Reason = "Volume backups failed" + backup.Status.LastUpdateTimestamp = metav1.Now() + err = a.client.Update(context.TODO(), backup) + if err != nil { + return err + } + } + // If the backup hasn't failed move on to the next stage. 
if backup.Status.Status != stork_api.ApplicationBackupStatusFailed { backup.Status.Stage = stork_api.ApplicationBackupStageApplications @@ -949,6 +983,9 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio } } + // We will not handle individual failure of resources as GetResources() being generic package + // returns error for the whole and it has no view of backup CR object. Also it is unlikely that + // only a particular resource fetching fails and rest passes. err = a.backupResources(backup) if err != nil { message := fmt.Sprintf("Error backing up resources: %v", err) @@ -1558,7 +1595,6 @@ func (a *ApplicationBackupController) backupResources( return err } } - // Don't modify resources if mentioned explicitly in specs resourceCollectorOpts := resourcecollector.Options{} resourceCollectorOpts.ResourceCountLimit = k8sutils.DefaultResourceCountLimit @@ -1662,6 +1698,46 @@ func (a *ApplicationBackupController) backupResources( } } } + // Handling partial success case - If a vol is in failed/skipped state + // skip the resource collection for the same + processPartialObjects := make([]runtime.Unstructured, 0) + failedVolInfoMap := make(map[string]stork_api.ApplicationBackupStatusType) + for _, vol := range backup.Status.Volumes { + if vol.Status == stork_api.ApplicationBackupStatusFailed { + failedVolInfoMap[vol.Volume] = vol.Status + } + } + backup.Status.FailedVolCount = len(failedVolInfoMap) + for _, obj := range allObjects { + objectType, err := meta.TypeAccessor(obj) + if err != nil { + return err + } + if objectType.GetKind() == "PersistentVolumeClaim" { + var pvc v1.PersistentVolumeClaim + // Find the matching object, skip + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.UnstructuredContent(), &pvc); err != nil { + return fmt.Errorf("error converting to persistent volume: %v", err) + } + if _, ok := failedVolInfoMap[pvc.Spec.VolumeName]; ok { + continue + } + } else if objectType.GetKind() == "PersistentVolume" 
{ + var pv v1.PersistentVolume + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.UnstructuredContent(), &pv); err != nil { + return fmt.Errorf("error converting to persistent volume: %v", err) + } + if _, ok := failedVolInfoMap[pv.Name]; ok { + continue + } + } else { + processPartialObjects = append(processPartialObjects, obj) + } + processPartialObjects = append(processPartialObjects, obj) + } + + allObjects = processPartialObjects + if backup.Status.Resources == nil { // Save the collected resources infos in the status resourceInfos := make([]*stork_api.ApplicationBackupResourceInfo, 0) @@ -1895,7 +1971,11 @@ func (a *ApplicationBackupController) backupResources( backup.Status.BackupPath = GetObjectPath(backup) backup.Status.Stage = stork_api.ApplicationBackupStageFinal backup.Status.FinishTimestamp = metav1.Now() - backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful + if backup.Status.FailedVolCount > 0 { + backup.Status.Status = stork_api.ApplicationBackupStatusPartialSuccess + } else { + backup.Status.Status = stork_api.ApplicationBackupStatusSuccessful + } if len(backup.Spec.NamespaceSelector) != 0 && len(backup.Spec.Namespaces) == 0 { backup.Status.Reason = "Namespace label selector did not find any namespaces with selected labels for backup" } else { @@ -1906,6 +1986,7 @@ func (a *ApplicationBackupController) backupResources( for _, vInfo := range backup.Status.Volumes { backup.Status.TotalSize += vInfo.TotalSize } + // Upload the metadata for the backup to the backup location if err = a.uploadMetadata(backup); err != nil { a.recorder.Event(backup, diff --git a/pkg/applicationmanager/controllers/applicationrestore.go b/pkg/applicationmanager/controllers/applicationrestore.go index c79785cb02..dd9c53492b 100644 --- a/pkg/applicationmanager/controllers/applicationrestore.go +++ b/pkg/applicationmanager/controllers/applicationrestore.go @@ -577,7 +577,7 @@ func (a *ApplicationRestoreController) restoreVolumes(restore 
*storkapi.Applicat continue } for _, volumeBackup := range backup.Status.Volumes { - if volumeBackup.Namespace != namespace { + if volumeBackup.Namespace != namespace || volumeBackup.Status == storkapi.ApplicationBackupStatusFailed || volumeBackup.Status == storkapi.ApplicationBackupStatusSkip { continue } // If a list of resources was specified during restore check if