controller

package
v0.0.0-...-c6d8f22 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 28, 2026 License: Apache-2.0, Apache-2.0 Imports: 61 Imported by: 0

Documentation

Index

Constants

View Source
// Status condition vocabulary: the condition type names and the reason
// strings that accompany them. All values are untyped string constants.
// NOTE(review): which resource's status these conditions are set on is not
// visible from this declaration — confirm against the reconcilers.
const (
	// Condition types
	ConditionTypeEndpointsReady = "EndpointsReady"
	ConditionTypeServicesFound  = "ServicesFound"

	// Condition reasons
	ReasonAllEndpointsReady   = "AllEndpointsReady"
	ReasonEndpointsDiscovered = "EndpointsDiscovered"
	ReasonNotReady            = "NotReady"
	ReasonNoEndpoints         = "NoEndpoints"
	ReasonServicesFound       = "ServicesFound"
	ReasonNoServicesFound     = "NoServicesFound"
)
View Source
// Default names plus the nvidia.com/ annotation keys related to the
// Deployment strategy and its rolling-update max-surge / max-unavailable
// settings.
// NOTE(review): the accepted annotation values and where they are parsed are
// not visible here — confirm against the code that reads these keys.
const (
	DefaultClusterName                                  = "default"
	DefaultServiceAccountName                           = "default"
	KubeAnnotationDeploymentStrategy                    = "nvidia.com/deployment-strategy"
	KubeAnnotationDeploymentRollingUpdateMaxSurge       = "nvidia.com/deployment-rolling-update-max-surge"
	KubeAnnotationDeploymentRollingUpdateMaxUnavailable = "nvidia.com/deployment-rolling-update-max-unavailable"
)
View Source
// Constants for the DGDR (DynamoGraphDeploymentRequest) profiling workflow:
// job and container names, the profiling ServiceAccount, annotation keys,
// volume names and mount paths, profiler command-line flags, and the message
// strings used when reporting progress or failure.
const (
	// Job naming: prefixes for the two profiling job flavours (online and AIC).
	JobNamePrefixOnline = "profile-online-"
	JobNamePrefixAIC    = "profile-aic-"

	// Container names
	ContainerNameProfiler     = "profiler"
	ContainerNameOutputCopier = "output-copier"

	// ServiceAccount
	ServiceAccountProfilingJob = "dgdr-profiling-job"

	// ConfigMap naming
	ConfigMapOutputPrefix = "dgdr-output-"

	// Annotation keys
	AnnotationAdditionalResources = "dgdr.nvidia.com/additional-resources"

	// Annotation keys for v1alpha1 round-trip compatibility.
	// The conversion layer stores v1alpha1 fields that have no v1beta1 spec equivalent
	// as annotations so the controller can still honour them for converted resources.
	AnnotationConfigMapRef = "nvidia.com/dgdr-config-map-ref"
	AnnotationOutputPVC    = "nvidia.com/dgdr-output-pvc"

	// Size limits
	// NOTE(review): Kubernetes applies its 256KiB annotation limit to the
	// combined size of all annotations on an object, not to each one —
	// confirm this value leaves enough headroom for the other annotations.
	MaxAnnotationSize = 250000 // ~250KB, below K8s 256KB limit

	// Sidecar image
	// NOTE(review): ":latest" is a mutable tag, so the image actually pulled
	// can change between runs; consider pinning a version or digest.
	SidecarImage = "bitnami/kubectl:latest"

	// Volume names
	VolumeNameProfilingOutput = "profiling-output"
	VolumeNameProfilingConfig = "profiling-config"
	VolumeNameModelCache      = "model-cache"

	// Volume paths
	ProfilingOutputPath        = "/data"
	ProfilingOutputFile        = "final_config.yaml"
	ProfilingConfigMountPath   = "/config"
	ProfilingConfigDefaultKey  = "disagg.yaml"
	DefaultModelCacheMountPath = "/opt/model-cache"

	// Command line arguments
	ArgModel   = "--model"
	ArgBackend = "--backend"
	ArgTTFT    = "--ttft"
	ArgITL     = "--itl"
	ArgConfig  = "--config"

	// Messages
	// Entries containing %s verbs are format strings; the number of verbs in
	// each literal is the number of string arguments it expects.
	// MessageSpecChangeRejected is elided by the documentation renderer (see
	// the inline marker); its real value is a 151-byte literal in the source.
	// NOTE(review): MessageJobCreationFailed through MessageProfilingCheckFailed
	// hold CamelCase identifiers rather than sentences — they look like reason
	// codes; confirm how callers use them.
	MessageInitialized               = "DGDR initialized successfully"
	MessageProfilingJobCreated       = "Profiling job created"
	MessageAICProfilingJobCreated    = "AIC profiling job created"
	MessageProfilingInProgress       = "Profiling is in progress"
	MessageSpecGenerated             = "DynamoGraphDeployment spec generated successfully"
	MessageSpecAvailable             = "Generated spec is available in annotation nvidia.com/generated-dgd-spec"
	MessageDeploymentCreated         = "DynamoGraphDeployment %s created successfully"
	MessageDeploymentReady           = "DynamoGraphDeployment %s is ready"
	MessageDeploymentDegraded        = "DynamoGraphDeployment %s degraded from Ready to %s"
	MessageDeploymentDeleted         = "DGD %s was deleted. DGDR will not recreate it. Delete this DGDR and create a new one to redeploy."
	MessageInvalidState              = "Invalid state"
	MessageSpecChangeRejected        = "" /* 151-byte string literal not displayed */
	MessageJobCreationFailed         = "JobCreationFailed"
	MessageDeploymentCreationFailed  = "DeploymentCreationFailed"
	MessageResultsRetrievalFailed    = "ResultsRetrievalFailed"
	MessageGenerationFailed          = "GenerationFailed"
	MessageAIConfiguratorCheckFailed = "AIConfiguratorCheckFailed"
	MessageProfilingCheckFailed      = "ProfilingCheckFailed"
	MessageConfigMapNotFound         = "ConfigMap %s not found in namespace %s"
	MessageConfigMapKeyNotFound      = "key %s not found in ConfigMap %s"
	MessageModelCachePVCNotFound     = "model cache PVC %s not found in namespace %s"
)

Variables

This section is empty.

Functions

func IsDeploymentReady

func IsDeploymentReady(deployment *appsv1.Deployment) bool

IsDeploymentReady determines if a Kubernetes Deployment is fully ready and available. It checks various status fields to ensure all replicas are available and the deployment configuration has been fully applied.

func IsLeaderWorkerSetReady

func IsLeaderWorkerSetReady(leaderWorkerSet *leaderworkersetv1.LeaderWorkerSet) bool

IsLeaderWorkerSetReady determines if a LeaderWorkerSet is fully ready and available.

Types

type CheckpointReconciler

// CheckpointReconciler reconciles a DynamoCheckpoint object.
type CheckpointReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Config is the operator configuration.
	Config        *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig *commonController.RuntimeConfig
	// Recorder is the event recorder exposed through GetRecorder.
	Recorder      record.EventRecorder
}

CheckpointReconciler reconciles a DynamoCheckpoint object

func (*CheckpointReconciler) GetRecorder

func (r *CheckpointReconciler) GetRecorder() record.EventRecorder

GetRecorder returns the event recorder (implements controller_common.Reconciler interface)

func (*CheckpointReconciler) Reconcile

func (r *CheckpointReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

func (*CheckpointReconciler) SetupWithManager

func (r *CheckpointReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type ComponentReconcileResult

// ComponentReconcileResult is an exported type with no exported fields in
// this view, so it can only be populated or inspected from inside this
// package.
// NOTE(review): presumably the outcome of reconciling a single component —
// confirm against the source.
type ComponentReconcileResult struct {
	// contains filtered or unexported fields
}

type DynamoComponentDeploymentReconciler

// DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment
// object.
type DynamoComponentDeploymentReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Recorder is the event recorder exposed through GetRecorder.
	Recorder              record.EventRecorder
	// Config is the operator configuration.
	Config                *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig         *commonController.RuntimeConfig
	// DockerSecretRetriever has an unexported type, so its method set is not
	// visible in this documentation.
	// NOTE(review): presumably looks up image pull secrets — confirm.
	DockerSecretRetriever dockerSecretRetriever
}

DynamoComponentDeploymentReconciler reconciles a DynamoComponentDeployment object

func (*DynamoComponentDeploymentReconciler) FinalizeResource

func (r *DynamoComponentDeploymentReconciler) FinalizeResource(ctx context.Context, dynamoComponentDeployment *v1alpha1.DynamoComponentDeployment) error

func (*DynamoComponentDeploymentReconciler) GetRecorder

func (*DynamoComponentDeploymentReconciler) Reconcile

func (r *DynamoComponentDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoComponentDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, check Reconcile and its Result here: - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile

func (*DynamoComponentDeploymentReconciler) SetupWithManager

func (r *DynamoComponentDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentReconciler

// DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object.
type DynamoGraphDeploymentReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Config is the operator configuration.
	Config                *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig         *commoncontroller.RuntimeConfig
	// Recorder is the event recorder used by this reconciler.
	Recorder              record.EventRecorder
	// DockerSecretRetriever has an unexported type, so its method set is not
	// visible in this documentation.
	DockerSecretRetriever dockerSecretRetriever
	// ScaleClient provides access to scale subresources.
	ScaleClient           scale.ScalesGetter
	// MPISecretReplicator is a secret replicator.
	// NOTE(review): presumably copies the MPI secret across namespaces — confirm.
	MPISecretReplicator   *secret.SecretReplicator
	// RBACManager is typed by the unexported rbacManager, which is a different
	// type from this package's exported RBACManager interface.
	RBACManager           rbacManager
}

DynamoGraphDeploymentReconciler reconciles a DynamoGraphDeployment object

func (*DynamoGraphDeploymentReconciler) FinalizeResource

func (*DynamoGraphDeploymentReconciler) GetRecorder

func (*DynamoGraphDeploymentReconciler) Reconcile

func (r *DynamoGraphDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error)

Reconcile is part of the main kubernetes reconciliation loop which aims to move the current state of the cluster closer to the desired state. TODO(user): Modify the Reconcile function to compare the state specified by the DynamoGraphDeployment object against the actual cluster state, and then perform operations to make the cluster state reflect the state specified by the user.

For more details, check Reconcile and its Result here: - https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/reconcile

func (*DynamoGraphDeploymentReconciler) SetupWithManager

func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager.

type DynamoGraphDeploymentRequestReconciler

// DynamoGraphDeploymentRequestReconciler reconciles a
// DynamoGraphDeploymentRequest object.
type DynamoGraphDeploymentRequestReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// APIReader is a read-only client kept alongside the embedded Client.
	// NOTE(review): presumably an uncached reader that talks to the API
	// server directly — confirm where it is wired up.
	APIReader     client.Reader
	// Recorder is the event recorder exposed through GetRecorder.
	Recorder      record.EventRecorder
	// Config is the operator configuration.
	Config        *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig *commonController.RuntimeConfig

	// RBACManager handles RBAC setup for profiling jobs
	RBACManager RBACManager
}

DynamoGraphDeploymentRequestReconciler reconciles a DynamoGraphDeploymentRequest object

func (*DynamoGraphDeploymentRequestReconciler) FinalizeResource

FinalizeResource implements commonController.Finalizer interface

func (*DynamoGraphDeploymentRequestReconciler) GetRecorder

GetRecorder implements commonController.Reconciler interface

func (*DynamoGraphDeploymentRequestReconciler) Reconcile

Reconcile handles the reconciliation loop for DynamoGraphDeploymentRequest

func (*DynamoGraphDeploymentRequestReconciler) SetupWithManager

func (r *DynamoGraphDeploymentRequestReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager

type DynamoGraphDeploymentScalingAdapterReconciler

// DynamoGraphDeploymentScalingAdapterReconciler reconciles a
// DynamoGraphDeploymentScalingAdapter object.
type DynamoGraphDeploymentScalingAdapterReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Scheme is the runtime scheme used by this reconciler.
	Scheme        *runtime.Scheme
	// Recorder is the event recorder used by this reconciler.
	Recorder      record.EventRecorder
	// Config is the operator configuration.
	Config        *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig *commonController.RuntimeConfig
}

DynamoGraphDeploymentScalingAdapterReconciler reconciles a DynamoGraphDeploymentScalingAdapter object

func (*DynamoGraphDeploymentScalingAdapterReconciler) Reconcile

Reconcile implements the reconciliation loop for DynamoGraphDeploymentScalingAdapter

func (*DynamoGraphDeploymentScalingAdapterReconciler) SetupWithManager

SetupWithManager sets up the controller with the Manager

type DynamoModelReconciler

// DynamoModelReconciler reconciles a DynamoModel object.
type DynamoModelReconciler struct {
	// Client is embedded, so its methods are promoted onto the reconciler.
	client.Client
	// Recorder is the event recorder used by this reconciler.
	Recorder       record.EventRecorder
	// EndpointClient is the model-endpoint client.
	// NOTE(review): presumably used to reach the endpoints serving a model —
	// confirm against Reconcile and FinalizeResource.
	EndpointClient *modelendpoint.Client
	// Config is the operator configuration.
	Config         *configv1alpha1.OperatorConfiguration
	// RuntimeConfig is the shared controller runtime configuration.
	RuntimeConfig  *commoncontroller.RuntimeConfig
}

DynamoModelReconciler reconciles a DynamoModel object

func (*DynamoModelReconciler) FinalizeResource

func (r *DynamoModelReconciler) FinalizeResource(ctx context.Context, model *v1alpha1.DynamoModel) error

FinalizeResource implements the Finalizer interface. It performs cleanup when a DynamoModel is being deleted.

func (*DynamoModelReconciler) Reconcile

func (r *DynamoModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error)

Reconcile handles the reconciliation loop for DynamoModel resources

func (*DynamoModelReconciler) SetupWithManager

func (r *DynamoModelReconciler) SetupWithManager(mgr ctrl.Manager) error

SetupWithManager sets up the controller with the Manager

type Message

// Message is the string type of the Message field of ReconcileResult.
type Message string

type RBACManager

// RBACManager is the interface for managing RBAC resources.
type RBACManager interface {
	// EnsureServiceAccountWithRBAC takes a target namespace, a service
	// account name and a cluster role name, and reports failure via error.
	// NOTE(review): presumably creates the service account in
	// targetNamespace if missing and binds it to clusterRoleName — confirm
	// against the implementation.
	EnsureServiceAccountWithRBAC(ctx context.Context, targetNamespace, serviceAccountName, clusterRoleName string) error
}

RBACManager interface for managing RBAC resources

type Reason

// Reason is the string type of the Reason field of ReconcileResult.
type Reason string

type ReconcileResult

// ReconcileResult groups a DGD state with the reason and message that
// explain it, plus per-service replica status and restart status.
type ReconcileResult struct {
	// State is the DGD state value.
	State         nvidiacomv1alpha1.DGDState
	// Reason is the reason accompanying State.
	Reason        Reason
	// Message is the message accompanying State.
	Message       Message
	// ServiceStatus maps a string key to a replica status.
	// NOTE(review): keys are presumably service names — confirm.
	ServiceStatus map[string]nvidiacomv1alpha1.ServiceReplicaStatus
	// RestartStatus is a pointer, so it may be nil.
	// NOTE(review): confirm what nil means to callers.
	RestartStatus *nvidiacomv1alpha1.RestartStatus
}

type Resource

// Resource is implemented by objects that can report their readiness, their
// name, and their per-service replica statuses.
type Resource interface {
	// IsReady reports readiness together with a reason string.
	// NOTE(review): whether reason is populated when ready is true is not
	// visible here — confirm against the implementations.
	IsReady() (ready bool, reason string)
	// GetName returns the resource's name.
	GetName() string
	// GetServiceStatuses returns replica statuses keyed by string.
	// NOTE(review): keys are presumably service names — confirm.
	GetServiceStatuses() map[string]nvidiacomv1alpha1.ServiceReplicaStatus
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL