Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
8712b73
add continue attribute for observability service alert config
PatrickKoss Sep 15, 2025
7ef4213
adjust the acceptance test observability
PatrickKoss Sep 15, 2025
702122b
adjust docs
PatrickKoss Sep 15, 2025
56eb265
add continue in another case
PatrickKoss Sep 17, 2025
b405ce7
Merge branch 'main' into main
PatrickKoss Sep 18, 2025
188f0b7
Merge branch 'stackitcloud:main' into main
PatrickKoss Sep 19, 2025
13fdd53
remove continue attribute from root
PatrickKoss Sep 19, 2025
113bbb9
fix acc test
PatrickKoss Sep 19, 2025
e073ec2
Merge branch 'stackitcloud:main' into main
PatrickKoss Sep 19, 2025
8118f17
fix docs
PatrickKoss Sep 19, 2025
3e1a403
fix unit tests
PatrickKoss Sep 19, 2025
039719f
remove route types
PatrickKoss Sep 19, 2025
6ffe516
Merge branch 'main' into main
rubenhoenle Sep 19, 2025
e74f9f8
Merge branch 'stackitcloud:main' into main
PatrickKoss Oct 29, 2025
4e99f0d
Merge branch 'stackitcloud:main' into main
PatrickKoss Nov 3, 2025
55183c5
Merge branch 'stackitcloud:main' into main
PatrickKoss Nov 10, 2025
ee3a0c8
add skip wait and set partial model
PatrickKoss Nov 10, 2025
76fc503
fix linting errors
PatrickKoss Nov 10, 2025
de09817
revert formatting
PatrickKoss Nov 11, 2025
e7649c2
revert formatting
PatrickKoss Nov 11, 2025
037cece
import state
PatrickKoss Nov 11, 2025
265836f
downlint lint from releases + remove read id check
PatrickKoss Nov 12, 2025
ba8ecc8
Merge branch 'main' into feature/dns-skip-wait
PatrickKoss Nov 12, 2025
1196efb
fix pipeline linting
PatrickKoss Nov 12, 2025
50f1f37
adjust SetModelFieldsToNull to handle complex objects and lists
PatrickKoss Nov 13, 2025
873f875
fix linting
PatrickKoss Nov 13, 2025
6e89bf9
fix linting
PatrickKoss Nov 13, 2025
b769ba1
add dns wait warn log for tf idempotency
PatrickKoss Nov 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Repository root, resolved through git. `?=` keeps any value the caller already supplied.
ROOT_DIR ?= $(shell git rev-parse --show-toplevel)
# Directory holding the helper scripts invoked by the recipes in this file.
SCRIPTS_BASE ?= $(ROOT_DIR)/scripts
# Directory for locally installed tool binaries (see GOLANGCI_LINT below).
BIN_DIR ?= $(ROOT_DIR)/bin

# https://github.com/golangci/golangci-lint/releases
# Pinned golangci-lint release and the path of the binary handed to the lint script.
GOLANGCI_LINT_VERSION = 1.64.8
GOLANGCI_LINT = $(BIN_DIR)/golangci-lint

# SETUP AND TOOL INITIALIZATION TASKS
project-help:
Expand All @@ -8,10 +13,14 @@ project-help:
# Invokes $(SCRIPTS_BASE)/project.sh with the "tools" argument.
project-tools:
@$(SCRIPTS_BASE)/project.sh tools

# GOLANGCI-LINT INSTALLATION
# The binary is declared phony on purpose: a plain file target without
# prerequisites is considered up to date as soon as the file exists, so a bump
# of GOLANGCI_LINT_VERSION would never re-run the installer. The install script
# is idempotent (it compares the installed version with the requested one and
# only downloads on mismatch), which makes running it on every invocation cheap.
.PHONY: $(GOLANGCI_LINT)
$(GOLANGCI_LINT):
	@GOLANGCI_LINT_VERSION=$(GOLANGCI_LINT_VERSION) $(SCRIPTS_BASE)/install-golangci-lint.sh

# LINT
lint-golangci-lint:
lint-golangci-lint: $(GOLANGCI_LINT)
@echo "Linting with golangci-lint"
@$(SCRIPTS_BASE)/lint-golangci-lint.sh
@$(SCRIPTS_BASE)/lint-golangci-lint.sh $(GOLANGCI_LINT)

lint-tf:
@echo "Linting terraform files"
Expand Down
42 changes: 42 additions & 0 deletions scripts/install-golangci-lint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
# Installs the golangci-lint release named by GOLANGCI_LINT_VERSION.
# utility.sh (sourced below) defines BIN_DIR, download and update_if_necessary.
set -e
# Quoted so the script also works when it lives in a path containing spaces.
. "$(dirname "${0}")/utility.sh"

# Fail early with a clear message instead of building a malformed download URL.
: "${GOLANGCI_LINT_VERSION:?GOLANGCI_LINT_VERSION must be set (e.g. 1.64.8)}"

BINARY_NAME=golangci-lint
INSTALL_TO="${BIN_DIR}/${BINARY_NAME}"

# Download the golangci-lint release archive matching the current OS and CPU
# and unpack it into ${BIN_DIR}.
#
# Reads:   BINARY_NAME, GOLANGCI_LINT_VERSION, BIN_DIR, OSTYPE
# Calls:   download (expected to stream the archive to stdout)
# Returns: non-zero when the platform is unsupported or when the download or
#          the extraction fails.
install() {
    echo " installing ${BINARY_NAME} ${GOLANGCI_LINT_VERSION}"

    local os arch base_url url

    case "${OSTYPE}" in
        linux*)  os=linux ;;
        darwin*) os=darwin ;;
        *)
            # Upstream ships Windows builds as .zip archives, so the tar.gz
            # pipeline below cannot handle them; report that explicitly
            # instead of failing on a URL that does not exist.
            echo " Error: unsupported OS '${OSTYPE}' for ${BINARY_NAME} installation" >&2
            return 1
            ;;
    esac

    case "$(uname -m)" in
        arm64|aarch64) arch=arm64 ;;
        *)             arch=amd64 ;;
    esac

    base_url="https://github.com/golangci/golangci-lint/releases/download/v${GOLANGCI_LINT_VERSION}"
    url="${base_url}/golangci-lint-${GOLANGCI_LINT_VERSION}-${os}-${arch}.tar.gz"
    echo " Downloading: ${url}"

    # pipefail is enabled in a subshell only, so a failed download is not
    # masked by tar's exit status and the caller's shell options stay untouched.
    # The archive is unpacked directly into ${BIN_DIR}, which already places the
    # binary at ${BIN_DIR}/golangci-lint; no rename step is needed.
    (
        set -o pipefail
        download "${url}" | tar --extract --gzip --strip-components 1 --preserve-permissions -C "${BIN_DIR}" -f-
    ) || return 1
}

# Print the version reported by the installed binary.
#
# Runs "${INSTALL_TO} version" and prints the fourth whitespace-separated
# field of its output. Prints nothing when the binary cannot be executed.
# The exit status is awk's, so it is 0 even if the binary is missing.
get_version() {
    # Quoted so an install path containing spaces is executed as one word.
    "${INSTALL_TO}" version 2>/dev/null | awk '{print $4}'
}

update_if_necessary ${GOLANGCI_LINT_VERSION}
13 changes: 7 additions & 6 deletions scripts/lint-golangci-lint.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
#!/usr/bin/env bash
# This script lints the SDK modules and the internal examples
# Pre-requisites: golangci-lint
# Pre-requisites: golangci-lint (provided by Makefile or system)
set -eo pipefail

ROOT_DIR=$(git rev-parse --show-toplevel)
GOLANG_CI_YAML_PATH="${ROOT_DIR}/golang-ci.yaml"
GOLANG_CI_ARGS="--allow-parallel-runners --timeout=5m --config=${GOLANG_CI_YAML_PATH}"

if type -p golangci-lint >/dev/null; then
:
else
echo "golangci-lint not installed, unable to proceed."
# Use provided golangci-lint binary or fallback to system installation
GOLANGCI_LINT_BIN="${1:-golangci-lint}"

if [ ! -x "${GOLANGCI_LINT_BIN}" ] && ! type -p "${GOLANGCI_LINT_BIN}" >/dev/null; then
echo "golangci-lint not found at ${GOLANGCI_LINT_BIN} and not installed in PATH, unable to proceed."
exit 1
fi

cd ${ROOT_DIR}
golangci-lint run ${GOLANG_CI_ARGS}
${GOLANGCI_LINT_BIN} run ${GOLANG_CI_ARGS}
46 changes: 46 additions & 0 deletions scripts/utility.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Common utility functions for tool installation scripts

ROOT_DIR=$(git rev-parse --show-toplevel)
# Honor a BIN_DIR supplied through the environment (the Makefile declares it
# with `?=`, i.e. as overridable); fall back to <repository root>/bin.
BIN_DIR="${BIN_DIR:-${ROOT_DIR}/bin}"

# Ensure bin directory exists
mkdir -p "${BIN_DIR}"

# Stream the content of a URL to stdout, using curl or, if curl is not
# available, wget.
#
# Arguments: $1 - URL to fetch
# Output:    the response body on stdout
# Exits with status 1 when neither tool is installed. The diagnostic goes to
# stderr because stdout is typically piped into another program (e.g. tar),
# where the message would otherwise be swallowed as payload.
download() {
    local url=$1
    if command -v curl &> /dev/null; then
        curl -sSfL "${url}"
    elif command -v wget &> /dev/null; then
        wget -qO- "${url}"
    else
        echo "Error: Neither curl nor wget found. Please install one of them." >&2
        exit 1
    fi
}

# Install the tool unless the expected version is already present.
#
# Arguments: $1 - expected version string
# Reads:     INSTALL_TO (path of the binary), BINARY_NAME (used in messages)
# Calls:     get_version and install, both provided by the sourcing script
# Sets:      CURRENT_VERSION / INSTALLED_VERSION (left as globals, as before)
# Returns 0 when the expected version is already installed or after install
# ran; a version mismatch after installation is reported as a warning only.
update_if_necessary() {
    local expected_version=$1

    if [ -x "${INSTALL_TO}" ]; then
        CURRENT_VERSION=$(get_version 2>/dev/null || true)
        if [ "${CURRENT_VERSION}" = "${expected_version}" ]; then
            echo " ${BINARY_NAME} ${expected_version} already installed"
            return 0
        fi
        echo " ${BINARY_NAME} version mismatch (current: ${CURRENT_VERSION}, expected: ${expected_version})"
        echo " updating to ${expected_version}..."
    fi

    install

    # get_version may end in a pipeline whose exit status is 0 even when the
    # binary could not be run, so an `|| echo unknown` fallback would never
    # trigger; substitute the placeholder when the output is empty instead.
    INSTALLED_VERSION=$(get_version 2>/dev/null || true)
    INSTALLED_VERSION=${INSTALLED_VERSION:-unknown}

    if [ "${INSTALLED_VERSION}" = "${expected_version}" ]; then
        echo " ${BINARY_NAME} ${expected_version} installed successfully"
    else
        echo " Warning: installed version (${INSTALLED_VERSION}) does not match expected version (${expected_version})" >&2
    fi
}
79 changes: 66 additions & 13 deletions stackit/internal/services/dns/recordset/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package dns

import (
"context"
"errors"
"fmt"
"net/http"
"strings"

"github.com/hashicorp/terraform-plugin-framework-validators/int64validator"
Expand All @@ -16,6 +18,7 @@ import (
"github.com/hashicorp/terraform-plugin-framework/schema/validator"
"github.com/hashicorp/terraform-plugin-framework/types"
"github.com/hashicorp/terraform-plugin-log/tflog"
"github.com/stackitcloud/stackit-sdk-go/core/oapierror"
"github.com/stackitcloud/stackit-sdk-go/services/dns"
"github.com/stackitcloud/stackit-sdk-go/services/dns/wait"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/conversion"
Expand Down Expand Up @@ -219,18 +222,30 @@ func (r *recordSetResource) Create(ctx context.Context, req resource.CreateReque
}

// Write id attributes to state before polling via the wait handler - just in case anything goes wrong during the wait handler
utils.SetAndLogStateFields(ctx, &resp.Diagnostics, &resp.State, map[string]any{
"project_id": projectId,
"zone_id": zoneId,
"record_set_id": *recordSetResp.Rrset.Id,
})
recordSetId := *recordSetResp.Rrset.Id
model.RecordSetId = types.StringValue(recordSetId)
model.Id = utils.BuildInternalTerraformId(projectId, zoneId, recordSetId)

// Set all unknown/null fields to null before saving state
if err := utils.SetModelFieldsToNull(ctx, &model); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, but I just don't get why one would want to have this. What's the point of this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be because of weird clients. Currently we only set project_id and zone_id in the state before waiting. This led to the following error when the waiting is skipped, which some clients want:

"error": "cannot get a terraform workspace for resource: cannot ensure tfstate file: cannot check whether the state is empty: cannot work with a non-string id: <nil>", "errorVerbose": "cannot work with a non-string id: <nil>

So I have set the id as well in the helper function. I think I observed in the past that the client got an error because some fields in the state were "unknown", but I can no longer find that error message. So currently I get:

apply failed: Provider produced inconsistent result after apply: When applying changes to stackit_dns_zone.example-zone, provider "provider[\"registry.terraform.io/stackitcloud/stackit\"]" produced an unexpected new value: .description: was cty.StringVal("Example DNS zone for demonstration"), but now null.

      This is a bug in the provider, which should be reported in the provider's own issue tracker.
      Provider produced inconsistent result after apply: When applying changes to stackit_dns_zone.example-zone, provider "provider[\"registry.terraform.io/stackitcloud/stackit\"]" produced an unexpected new value: .dns_name: was cty.StringVal("patrick.test.patrick.patrick"), but now null.

      This is a bug in the provider, which should be reported in the provider's own issue tracker.
      Provider produced inconsistent result after apply: When applying changes to stackit_dns_zone.example-zone, provider "provider[\"registry.terraform.io/stackitcloud/stackit\"]" produced an unexpected new value: .name: was cty.StringVal("example-zone"), but now null.

      This is a bug in the provider, which should be reported in the provider's own issue tracker.
      Provider produced inconsistent result after apply: When applying changes to stackit_dns_zone.example-zone, provider "provider[\"registry.terraform.io/stackitcloud/stackit\"]" produced an unexpected new value: .is_reverse_zone: was cty.False, but now null.

      This is a bug in the provider, which should be reported in the provider's own issue tracker.
      Provider produced inconsistent result after apply: When applying changes to stackit_dns_zone.example-zone, provider "provider[\"registry.terraform.io/stackitcloud/stackit\"]" produced an unexpected new value: .type: was cty.StringVal("primary"), but now null.

      This is a bug in the provider, which should be reported in the provider's own issue tracker.

Because of it the client wants to destroy the resource:

"error": "cannot run plan: plan failed: Instance cannot be destroyed: Resource stackit_dns_zone.example-zone has lifecycle.prevent_destroy set, but the plan calls for this resource to be destroyed. To avoid this error and continue with the plan, either disable lifecycle.prevent_destroy or reduce the scope of the plan using the -target flag.", "errorVerbose": "plan failed: Instance cannot be destroyed: Resource stackit_dns_zone.example-zone has lifecycle.prevent_destroy set, but the plan calls for this resource to be destroyed. To avoid this error and continue with the plan, either disable lifecycle.prevent_destroy or reduce the scope of the plan using the -target flag.

For some reason if you set the fields to null instead of unknown the client accepts it and proceeds correctly. Maybe we need to take a look together into the topic. If you have some better ways to handle this case feel free to suggest :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to make sure I don't mess things up here, what do you mean with client?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

crossplane+upjet that then executes terraform cli commands

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

Messages like this are there for a reason by Terraform. You would break this behavior with this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect that is a problem with complex objects, list of lists and list of complex objects in the utils function SetModelFieldsToNull.
I also tried adding the same logic as in zone to iaas network and added a lot of unit tests to provoke the error, but couldn't reproduce it. You can check it here if you want.

Can you provide the input parameters so I can add unit tests for this case to verify if it happens in the implementation or not?
Additionally, you can check in your setup whether the added functionality resolves the issue.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or do you imply that it is perfectly fine to have errors? because if we want to use upjet to generate a crossplane provider we cannot accept such error since it simply does not work :D

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or do you imply that it is perfectly fine to have errors?

Clear no.

because if we want to use upjet to generate a crossplane provider we cannot accept such error since it simply does not work :D

Well, I guess it doesn't work because you modified the code of the terraform provider and didn't understand the impacts of your changes.

I have to start from scratch here: Unknown values are a core concept of Terraform (see https://developer.hashicorp.com/terraform/plugin/framework/handling-data/terraform-concepts#unknown-values). Unknown values are important for Terraform to apply resources in the correct order, ...

But what does this mean for us? After a terraform apply run which creates a new resource, all fields of the resource must be set by the Terraform provider to a value or to null explicitly. If this isn't done for a field of the resource, you will get a message like this:

image

Whenever you get a message like this it's clear that this is a bug in the Terraform provider. And I'm going to lean myself out of the window here and say this doesn't happen for the stackit_dns_record_set resource on the main branch of our STACKIT Terraform provider repository. 😄


Let me explain why

We create the resource on API side and then use the wait handler.

recordSetResp, err := r.client.CreateRecordSet(ctx, projectId, zoneId).CreateRecordSetPayload(*payload).Execute()
if err != nil || recordSetResp.Rrset == nil || recordSetResp.Rrset.Id == nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating record set", fmt.Sprintf("Calling API: %v", err))
return
}
// Write id attributes to state before polling via the wait handler - just in case anything goes wrong during the wait handler
utils.SetAndLogStateFields(ctx, &resp.Diagnostics, &resp.State, map[string]any{
"project_id": projectId,
"zone_id": zoneId,
"record_set_id": *recordSetResp.Rrset.Id,
})
if resp.Diagnostics.HasError() {
return
}
waitResp, err := wait.CreateRecordSetWaitHandler(ctx, r.client, projectId, zoneId, *recordSetResp.Rrset.Id).WaitWithContext(ctx)
if err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating record set", fmt.Sprintf("Instance creation waiting: %v", err))
return
}

After the wait handler we use the mapFields function to map the API response to the Terraform state model.

// Map response body to schema
err = mapFields(ctx, waitResp, &model)
if err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating record set", fmt.Sprintf("Processing API payload: %v", err))
return
}
// Set state to fully populated data
diags = resp.State.Set(ctx, model)
resp.Diagnostics.Append(diags...)
if resp.Diagnostics.HasError() {
return
}

Now comes the important part: Here is the section in the mapFields function, which makes sure all fields of the resource get set to a value or null. [1]

model.Id = utils.BuildInternalTerraformId(
model.ProjectId.ValueString(), model.ZoneId.ValueString(), recordSetId,
)
model.RecordSetId = types.StringPointerValue(recordSet.Id)
model.Active = types.BoolPointerValue(recordSet.Active)
model.Comment = types.StringPointerValue(recordSet.Comment)
model.Error = types.StringPointerValue(recordSet.Error)
if model.Name.IsNull() || model.Name.IsUnknown() {
model.Name = types.StringPointerValue(recordSet.Name)
}
model.FQDN = types.StringPointerValue(recordSet.Name)
model.State = types.StringValue(string(recordSet.GetState()))
model.TTL = types.Int64PointerValue(recordSet.Ttl)
model.Type = types.StringValue(string(recordSet.GetType()))

Well, and after that the model struct must be persisted in the Terraform state (this doesn't happen automatically):

// Set state to fully populated data
diags = resp.State.Set(ctx, model)
resp.Diagnostics.Append(diags...)
if resp.Diagnostics.HasError() {
return
}

To sum it up, here's what happens in the main branch implementation of this resource:

  1. Create request for the API resource
  2. (Write id fields to the state in case anything goes wrong during the wait handler)
  3. Wait handler to wait for creation of the API resource to complete
  4. Map API response to Terraform resource model struct (mapFields)
  5. Persist the Terraform model struct of the resource in the Terraform state

Now to your changes

Now to your changes and why it's not working (without setting all fields to null using your new reflection-powered util func):

In your func (r *recordSetResource) Create(...) ... implementation...

  • You also do the Create request for the API resource (see no. 1 above)
  • You write the id fields to the state (see no 2. above)
  • And then you jump out of the Create implementation of the Terraform resource prematurely with the code below.
	if !utils.ShouldWait() {
		tflog.Info(ctx, "Skipping wait; async mode for Crossplane/Upjet")
		return
	}

The problem is: This doesn't only skip the wait handler (no. 3 above), but also the mapFields func call (no. 4 above) which (as said) sets explicitly all values to a value or null.

Again, you just skip this. This is a core part of the resource implementation. You don't call it. That's why Terraform complains about unknown values. Terraform says this is a bug in the provider implementation, and it's correct.

But it's sadly not a bug in our implementation on the main branch, but in your implementation.

You circumvent this problem by setting all fields of the Terraform resource state model explicitly to null by using your new util func. This circumvents the problem (Terraform doesn't complain anymore about unknown values), but it doesn't really fix the problem (at least not in a clean way).

In fact setting all fields of the Terraform resource model struct to null circumvents existing checks of Terraform which we want to take advantage of during our resource implementations (at least for pure Terraform usage, without thinking of crossplane here).


[1] Btw, if you forget to set one field of the Terraform resource model struct to a value or null here during the implementation of the Terraform resource, you will also get exactly the error "After the apply operation, the provider still indicated an unknown value..." from above. This is what I consider a Terraform feature. As said, unknown values are a concept of Terraform.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the detailed explanation. It covers well my observations. I think we are actually on two sides of the same coin.

Let's take a step back and start with the requirements for the create again; then I'll share my observations during testing, and then we can check different alternatives.

Requirements

  • Have idempotency. If I apply a resource and somehow fail right after the api call (for example due to timeouts, context cancels, random api errors in the wait handler) I want the resource to be in the state and use a Read to fill the model in the next apply. There should be no state drifts or replacements of the resource created in the first apply.
  • Have a way to return right after the creation of the resource without waiting. This comes from upjet/crossplane (therefore the skip method with the log that it is intended to only use by this tool). Upjet needs the ids of the resources quite fast to persist them in kubernetes custom resources (the database so to say). Because the resource is only known once stored in the custom resource. It does hold the terraform state in a file only temporary. During applies the state is constructed with the custom resource. The problem with the waiting here is that the controller executing terraform can restart in any point in time. Since the wait handler can take quite a bit of time we risk creating the same resource twice. Therefore the early return. And it is completely fine for the tool since it executes a Read directly after the return. Every 10 min it queries the state of the cloud resource as well. So eventually it will reach the point where the cloud resource and the custom resource have the same data. That´s the standard kubernetes reconciling mechanism.
  • (optional) have a common way to achieve idempotency in every single resource. We should have a rock solid way without much custom implementation as it is error prone to do it for each resource.

Code Walkthrough, Testing and Observations

We already recognized that we need to set partial states in the terraform state. That´s why the following code already exists in the main branch:

	utils.SetAndLogStateFields(ctx, &resp.Diagnostics, &resp.State, map[string]any{
		"project_id":    projectId,
		"zone_id":       zoneId,
		"record_set_id": *recordSetResp.Rrset.Id,
	})
	if resp.Diagnostics.HasError() {
		return
	}

https://github.com/PatrickKoss/terraform-provider-stackit/blob/main/stackit/internal/services/dns/recordset/resource.go#L222-L229

In my tests I have setup a terraform resource (in this case mariadb with the same function as mariadb takes way longer to create and dns is super fast. So please don´t be confused about the resource we are still talking about the same code)

resource "stackit_mariadb_instance" "example_maria_db" {
  name       = "example-mariadb"
  plan_name  = "stackit-mariadb-1.4.10-single"
  project_id = "xxx"
  version    = "10.6"
}

Then I applied and once the wait handler started and I saw mariadb in creating state in the portal I canceled the apply to simulate random failures as mentioned above. Then I reapplied and got the error: stackit_mariadb_instance.example_maria_db is tainted, so must be replaced.

That's when I recognized that setting ids is not enough and we need to include the fields in the resource as well (name, plan_name, version). So I changed the code to:

	utils.SetAndLogStateFields(ctx, &resp.Diagnostics, &resp.State, map[string]interface{}{
		"project_id":  projectId,
		"instance_id": model.InstanceId.ValueString(),
		"id":          model.Id.ValueString(),
		"name":        model.Name.ValueString(),
		"plan_name":   model.PlanName.ValueString(),
		"version":     model.Version.ValueString(),
		"plan_id":     model.PlanId.ValueString(),
	})
	if resp.Diagnostics.HasError() {
		return
	}

and that almost worked. We also should not log an error in the wait handler, as it messes up Terraform and results in non-idempotent behaviour:

	waitResp, err := wait.CreateInstanceWaitHandler(ctx, r.client, projectId, instanceId).WaitWithContext(ctx)
	if err != nil {
		tflog.Warn(ctx, fmt.Sprintf("Instance creation waiting failed: %v. The instance was created but waiting for ready state was interrupted. State will be refreshed on next apply.", err))
		return
	}

And that works perfectly fine in the case of create/cancel/reapply. Now there are no state drift and the resource stays as it is.

Now I went a step further and wrote unit tests for the behaviour. So we can really verify that it works how we think it works.

	// Verify that Read successfully populated all fields from the API
	var stateAfterRead Model
	diags = readResp.State.Get(tc.Ctx, &stateAfterRead)
	require.False(t, diags.HasError(), "Expected no errors reading state after Read")

	// Verify all fields are now complete after successful Read (prevents state drift)
	require.Equal(t, instanceId, stateAfterRead.InstanceId.ValueString())
	require.Equal(t, fmt.Sprintf("%s,%s", projectId, instanceId), stateAfterRead.Id.ValueString())
	require.Equal(t, projectId, stateAfterRead.ProjectId.ValueString())
	require.Equal(t, instanceName, stateAfterRead.Name.ValueString())
	require.Equal(t, planId, stateAfterRead.PlanId.ValueString())
	require.Equal(t, planName, stateAfterRead.PlanName.ValueString())
	require.Equal(t, version, stateAfterRead.Version.ValueString())

	// CRITICAL: Verify fields that were NULL after Create are now populated
	// This prevents Terraform state drift on the next apply
	require.False(t, stateAfterRead.DashboardUrl.IsNull(), "DashboardUrl must be populated by Read to prevent state drift")
	require.Equal(t, dashboardUrl, stateAfterRead.DashboardUrl.ValueString())
	require.False(t, stateAfterRead.CfGuid.IsNull(), "CfGuid must be populated by Read to prevent state drift")
	require.False(t, stateAfterRead.ImageUrl.IsNull(), "ImageUrl must be populated by Read to prevent state drift")

https://github.com/PatrickKoss/terraform-provider-stackit/blob/feature/cp-enhancements/stackit/internal/services/mariadb/instance/resource_create_test.go#L16-L157

The unit test covers the manual test create/cancel/read. Note that setting the partial state actually leads to null fields when reading the state again. Then I inserted utils.SetModelFieldsToNull instead of utils.SetAndLogStateFields and the test(s) were equally successful. This led me to the assumption that we are actually on two different sides of the same coin (different code but same outcome). Not setting fields in the state leads to null values, while setting them to null explicitly also results in reading out null values. So we have probably found multiple ways to solve the idempotency problem. More in the alternatives.

Second the early exit is this code:

	if !utils.ShouldWait() {
		tflog.Info(ctx, "Skipping wait; async mode for Crossplane/Upjet")
		return
	}

Note this function is only executed if an environment variable is set to "true". If the variable is not set or to any other value than "true" we would continue with the wait handler. Not pretty but we somehow need to cover the requirement since the tool works as it works.

Alternatives/Conclusion

I think there is no real discussion about the early return but if there is feel free to suggest something.

The more interesting point is the idempotency part.

  • As we already saw in the tests, we need to set the ids and the fields specified in the resource to avoid state drift and resource recreation. One approach could be like the current one in the main branch, but made a bit more abstract: we can construct the map based on the model. Similar to the proposed implementation utils.SetModelFieldsToNull, we can iterate over the model's attributes with reflection magic, check for fields that are not null/unknown, and use the tags (tfsdk) of the model as keys for the map and the values of the model's attributes as values. This should result in the map we want to store as partial state in the Terraform state.
  • Similar to the first approach, we can go the reverse way: have the model already set and then set all fields that are unknown to null. That's also the proposed approach. You highlighted correctly that it might not be the best idea to use the same model that is used after the wait handler, as we also want to verify the behaviour of the map function after the wait handler. This means we should do a deepcopy, set the model fields on this deepcopy to null, and also save the deepcopy in the struct.
  • One last approach that I could come up with is the construction of the map with a lot of if-conditions in the resource, without any reflection magic. That is the least preferred option, as it requires implementing it in every resource and is error-prone since we may miss fields. (That's what I mean in the third requirement.)

So, what do you think? Do you have other testing experiences? Which direction should we go?

core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating record set", fmt.Sprintf("Setting model fields to null: %v", err))
return
}

diags = resp.State.Set(ctx, model)
resp.Diagnostics.Append(diags...)
if resp.Diagnostics.HasError() {
return
}

if !utils.ShouldWait() {
tflog.Info(ctx, "Skipping wait; async mode for Crossplane/Upjet")
return
}

waitResp, err := wait.CreateRecordSetWaitHandler(ctx, r.client, projectId, zoneId, *recordSetResp.Rrset.Id).WaitWithContext(ctx)
if err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating record set", fmt.Sprintf("Instance creation waiting: %v", err))
tflog.Warn(ctx, fmt.Sprintf("Record set creation waiting failed: %v. The record set creation was triggered but waiting for completion was interrupted. The record set may still be creating.", err))
return
}

Expand Down Expand Up @@ -266,6 +281,12 @@ func (r *recordSetResource) Read(ctx context.Context, req resource.ReadRequest,

recordSetResp, err := r.client.GetRecordSet(ctx, projectId, zoneId, recordSetId).Execute()
if err != nil {
var oapiErr *oapierror.GenericOpenAPIError
ok := errors.As(err, &oapiErr)
if ok && (oapiErr.StatusCode == http.StatusNotFound || oapiErr.StatusCode == http.StatusGone) {
resp.State.RemoveResource(ctx)
return
}
core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading record set", fmt.Sprintf("Calling API: %v", err))
return
}
Expand Down Expand Up @@ -319,9 +340,15 @@ func (r *recordSetResource) Update(ctx context.Context, req resource.UpdateReque
core.LogAndAddError(ctx, &resp.Diagnostics, "Error updating record set", err.Error())
return
}

if !utils.ShouldWait() {
tflog.Info(ctx, "Skipping wait; async mode for Crossplane/Upjet")
return
}

waitResp, err := wait.PartialUpdateRecordSetWaitHandler(ctx, r.client, projectId, zoneId, recordSetId).WaitWithContext(ctx)
if err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error updating record set", fmt.Sprintf("Instance update waiting: %v", err))
tflog.Warn(ctx, fmt.Sprintf("Record set update waiting failed: %v. The record set update was triggered but waiting for completion was interrupted. The record set may still be updating.", err))
return
}

Expand Down Expand Up @@ -358,11 +385,25 @@ func (r *recordSetResource) Delete(ctx context.Context, req resource.DeleteReque
// Delete existing record set
_, err := r.client.DeleteRecordSet(ctx, projectId, zoneId, recordSetId).Execute()
if err != nil {
// If resource is already gone (404 or 410), treat as success for idempotency
var oapiErr *oapierror.GenericOpenAPIError
ok := errors.As(err, &oapiErr)
if ok && (oapiErr.StatusCode == http.StatusNotFound || oapiErr.StatusCode == http.StatusGone) {
tflog.Info(ctx, "Record set already deleted")
return
}
core.LogAndAddError(ctx, &resp.Diagnostics, "Error deleting record set", fmt.Sprintf("Calling API: %v", err))
return
}

if !utils.ShouldWait() {
tflog.Info(ctx, "Skipping wait; async mode for Crossplane/Upjet")
return
}

_, err = wait.DeleteRecordSetWaitHandler(ctx, r.client, projectId, zoneId, recordSetId).WaitWithContext(ctx)
if err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error deleting record set", fmt.Sprintf("Instance deletion waiting: %v", err))
tflog.Warn(ctx, fmt.Sprintf("Record set deletion waiting failed: %v. The record set deletion was triggered but waiting for completion was interrupted. The record set may still be deleting.", err))
return
}
tflog.Info(ctx, "DNS record set deleted")
Expand All @@ -380,11 +421,23 @@ func (r *recordSetResource) ImportState(ctx context.Context, req resource.Import
return
}

utils.SetAndLogStateFields(ctx, &resp.Diagnostics, &resp.State, map[string]interface{}{
"project_id": idParts[0],
"zone_id": idParts[1],
"record_set_id": idParts[2],
})
var model Model
model.ProjectId = types.StringValue(idParts[0])
model.ZoneId = types.StringValue(idParts[1])
model.RecordSetId = types.StringValue(idParts[2])
model.Id = utils.BuildInternalTerraformId(idParts[0], idParts[1], idParts[2])

if err := utils.SetModelFieldsToNull(ctx, &model); err != nil {
core.LogAndAddError(ctx, &resp.Diagnostics, "Error importing zone", fmt.Sprintf("Setting model fields to null: %v", err))
return
}

diags := resp.State.Set(ctx, model)
resp.Diagnostics.Append(diags...)
if diags.HasError() {
return
}

tflog.Info(ctx, "DNS record set state imported")
}

Expand Down
Loading
Loading