cleanup sync logic in app

This commit is contained in:
2026-06-25 21:27:38 +00:00
parent 759b38e68d
commit ab6a03be3b
3 changed files with 139 additions and 116 deletions
+13 -44
View File
@@ -36,7 +36,7 @@ func Run() {
cluster.Init(client) cluster.Init(client)
start := time.Now() start := time.Now()
log.Printf("[INFO] starting cluster sync\n") log.Printf("[INFO] starting cluster sync\n")
err := cluster.Sync() err := SyncCluster(&cluster)
if err != nil { if err != nil {
log.Printf("[Error] error encountered while syncing cluster: %s", err) log.Printf("[Error] error encountered while syncing cluster: %s", err)
} else { } else {
@@ -55,7 +55,7 @@ func Run() {
case <-ticker.C: case <-ticker.C:
start := time.Now() start := time.Now()
log.Printf("[INFO] starting cluster sync\n") log.Printf("[INFO] starting cluster sync\n")
err := cluster.Sync() err := SyncCluster(&cluster)
if err != nil { if err != nil {
log.Printf("[ERR ] error encountered while syncing cluster: %s", err) log.Printf("[ERR ] error encountered while syncing cluster: %s", err)
} else { } else {
@@ -139,7 +139,7 @@ func Run() {
router.POST("/sync", func(c *gin.Context) { router.POST("/sync", func(c *gin.Context) {
start := time.Now() start := time.Now()
log.Printf("[INFO] starting cluster sync\n") log.Printf("[INFO] starting cluster sync\n")
err := cluster.Sync() err := SyncCluster(&cluster)
if err != nil { if err != nil {
log.Printf("[ERR ] failed to sync cluster: %s", err) log.Printf("[ERR ] failed to sync cluster: %s", err)
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
@@ -151,64 +151,33 @@ func Run() {
}) })
router.POST("/nodes/:node/sync", func(c *gin.Context) { router.POST("/nodes/:node/sync", func(c *gin.Context) {
nodeid := c.Param("node") nodeName := c.Param("node")
start := time.Now() start := time.Now()
log.Printf("[INFO] starting %s sync\n", nodeid) log.Printf("[INFO] starting %s sync\n", nodeName)
err := cluster.RebuildNode(nodeid) err := SyncNode(&cluster, nodeName)
if err != nil { if err != nil {
log.Printf("[ERR ] failed to sync %s: %s", nodeid, err.Error()) log.Printf("[ERR ] failed to sync %s: %s", nodeName, err.Error())
c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
}
// after synchronizing a node, resync pool membership
err = cluster.ResolvePoolMembership()
if err != nil {
log.Printf("[ERR ] failed to sync %s: %s", nodeid, err.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
} else { } else {
log.Printf("[INFO] synced %s in %fs\n", nodeid, time.Since(start).Seconds()) log.Printf("[INFO] synced %s in %fs\n", nodeName, time.Since(start).Seconds())
} }
}) })
router.POST("/nodes/:node/instances/:vmid/sync", func(c *gin.Context) { router.POST("/nodes/:node/instances/:vmid/sync", func(c *gin.Context) {
nodeid := c.Param("node") nodeName := c.Param("node")
vmid, err := strconv.ParseUint(c.Param("vmid"), 10, 64) vmid, err := strconv.ParseUint(c.Param("vmid"), 10, 64)
if err != nil { if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("%s could not be converted to vmid (uint)", c.Param("instance"))}) c.JSON(http.StatusInternalServerError, gin.H{"error": fmt.Sprintf("%s could not be converted to vmid (uint)", c.Param("instance"))})
return return
} }
//go func() {
start := time.Now() start := time.Now()
log.Printf("[INFO] starting %s.%d sync\n", nodeid, vmid) log.Printf("[INFO] starting %s.%d sync\n", nodeName, vmid)
err = SyncInstance(&cluster, nodeName, uint(vmid))
node, err := cluster.GetNode(nodeid)
if err != nil { if err != nil {
log.Printf("[ERR ] failed to sync %s.%d: %s", nodeid, vmid, err.Error()) log.Printf("[ERR ] failed to sync %s.%d: %s", nodeName, vmid, err.Error())
c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
return
}
instance, err := node.GetInstance(uint(vmid))
if err != nil {
log.Printf("[ERR ] failed to sync %s.%d: %s", nodeid, vmid, err.Error())
c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
return
}
err = node.RebuildInstance(instance.Type, uint(vmid))
if err != nil {
log.Printf("[ERR ] failed to sync %s.%d: %s", nodeid, vmid, err.Error())
c.JSON(http.StatusNotFound, gin.H{"error": err.Error()})
}
// after synchronizing a node, resync pool membership
err = cluster.ResolvePoolMembership()
if err != nil {
log.Printf("[ERR ] failed to sync %s: %s", nodeid, err.Error())
c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
} else { } else {
log.Printf("[INFO] synced %s in %fs\n", nodeid, time.Since(start).Seconds()) log.Printf("[INFO] synced %s in %fs\n", nodeName, time.Since(start).Seconds())
} }
}) })
+124 -72
View File
@@ -27,12 +27,28 @@ func (cluster *Cluster) Get() (*Cluster, error) {
} }
} }
// SyncCluster performs a full resynchronization of the cluster: it rebuilds
// the cluster with BuildCluster and then recomputes pool membership with
// ResolvePoolMembership.
//
// cluster.OK is cleared before the rebuild starts and is set back to true only
// when both steps succeed. On failure the first error is returned unchanged
// and cluster.OK stays false.
func SyncCluster(cluster *Cluster) error {
	// Guard the flag with cluster.lock so that concurrent sync requests do not
	// race on it. The lock is released right away because BuildCluster
	// acquires cluster.lock itself.
	cluster.lock.Lock()
	cluster.OK = false
	cluster.lock.Unlock()

	if err := cluster.BuildCluster(); err != nil {
		return err
	}
	// NOTE(review): ResolvePoolMembership is called without cluster.lock held,
	// presumably because it locks internally; confirm.
	if err := cluster.ResolvePoolMembership(); err != nil {
		return err
	}

	cluster.lock.Lock()
	cluster.OK = true
	cluster.lock.Unlock()
	return nil
}
// hard sync cluster // hard sync cluster
func (cluster *Cluster) Sync() error { func (cluster *Cluster) BuildCluster() error {
// aquire lock on cluster, release on return // aquire lock on cluster, release on return
cluster.lock.Lock() cluster.lock.Lock()
defer cluster.lock.Unlock()
cluster.OK = false
cluster.Nodes = make(map[string]*Node) cluster.Nodes = make(map[string]*Node)
@@ -46,15 +62,15 @@ func (cluster *Cluster) Sync() error {
} }
// for each node: // for each node:
for _, hostName := range nodes { for _, nodeName := range nodes {
wg.Go(func() error { wg.Go(func() error {
start := time.Now() start := time.Now()
// rebuild node // rebuild node
err := cluster.RebuildNode(hostName) err := cluster.BuildNode(nodeName)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] error encountered while syncing node %s: %s", hostName, err) log.Printf("[ERR ] error encountered while syncing node %s: %s", nodeName, err)
} else { } else {
log.Printf("[INFO] synced node %s in %d ms", hostName, time.Since(start).Milliseconds()) log.Printf("[INFO] synced node %s in %d ms", nodeName, time.Since(start).Milliseconds())
} }
return err return err
}) })
@@ -66,17 +82,6 @@ func (cluster *Cluster) Sync() error {
return err return err
} }
cluster.lock.Unlock()
err = cluster.ResolvePoolMembership()
if err != nil {
return err
}
cluster.lock.Lock()
cluster.OK = true
cluster.lock.Unlock()
return nil return nil
} }
@@ -115,49 +120,66 @@ func (cluster *Cluster) ResolvePoolMembership() error {
} }
// get a node in the cluster // get a node in the cluster
func (cluster *Cluster) GetNode(hostName string) (*Node, error) { func (cluster *Cluster) GetNode(nodeName string) (*Node, error) {
// aquire cluster lock // aquire cluster lock
cluster.lock.Lock() cluster.lock.Lock()
defer cluster.lock.Unlock() defer cluster.lock.Unlock()
// get host // get node
host, ok := cluster.Nodes[hostName] node, ok := cluster.Nodes[nodeName]
if !ok { if !ok {
return nil, fmt.Errorf("%s not in cluster", hostName) return nil, fmt.Errorf("%s not in cluster", nodeName)
} else { } else {
// aquire host lock to wait in case of a concurrent write // aquire node lock to wait in case of a concurrent write
host.lock.Lock() node.lock.Lock()
defer host.lock.Unlock() defer node.lock.Unlock()
return host, nil return node, nil
} }
} }
// SyncNode resynchronizes a single node of the cluster: it rebuilds the node
// named nodeName with BuildNode and then recomputes pool membership with
// ResolvePoolMembership.
//
// cluster.OK is cleared before the rebuild starts and is set back to true only
// when both steps succeed. On failure the first error is returned unchanged
// and cluster.OK stays false.
func SyncNode(cluster *Cluster, nodeName string) error {
	// Guard the flag with cluster.lock so that concurrent sync requests do not
	// race on it. The lock is released before the rebuild starts.
	cluster.lock.Lock()
	cluster.OK = false
	cluster.lock.Unlock()

	// NOTE(review): BuildNode updates cluster.Nodes and this path does not
	// hold cluster.lock while it runs; confirm whether that is intended.
	if err := cluster.BuildNode(nodeName); err != nil {
		return err
	}
	// NOTE(review): ResolvePoolMembership is called without cluster.lock held,
	// presumably because it locks internally; confirm.
	if err := cluster.ResolvePoolMembership(); err != nil {
		return err
	}

	cluster.lock.Lock()
	cluster.OK = true
	cluster.lock.Unlock()
	return nil
}
// hard sync node // hard sync node
// returns error if the node could not be reached // returns error if the node could not be reached
func (cluster *Cluster) RebuildNode(hostName string) error { func (cluster *Cluster) BuildNode(nodeName string) error {
host, err := cluster.pve.Node(hostName) node, err := cluster.pve.Node(nodeName)
if err != nil && cluster.Nodes[hostName] == nil { // host is unreachable and did not exist previously if err != nil && cluster.Nodes[nodeName] == nil { // node is unreachable and did not exist previously
// return an error because we requested to sync a node that was not already in the cluster // return an error because we requested to sync a node that was not already in the cluster
return fmt.Errorf("error retrieving %s: %s", hostName, err.Error()) return fmt.Errorf("error retrieving %s: %s", nodeName, err.Error())
} }
// aquire lock on host, release on return // aquire lock on node, release on return
host.lock.Lock() node.lock.Lock()
defer host.lock.Unlock() defer node.lock.Unlock()
wg, _ := errgroup.WithContext(context.Background()) wg, _ := errgroup.WithContext(context.Background())
if err != nil && cluster.Nodes[hostName] != nil { // host is unreachable and did exist previously if err != nil && cluster.Nodes[nodeName] != nil { // node is unreachable and did exist previously
// assume the node is down or gone and delete from cluster // assume the node is down or gone and delete from cluster
delete(cluster.Nodes, hostName) delete(cluster.Nodes, nodeName)
return nil return nil
} }
cluster.Nodes[hostName] = host cluster.Nodes[nodeName] = node
// get node's VMs // get node's VMs
vms, err := host.VirtualMachines() vms, err := node.VirtualMachines()
if err != nil { if err != nil {
return err return err
@@ -165,29 +187,29 @@ func (cluster *Cluster) RebuildNode(hostName string) error {
for _, vmid := range vms { for _, vmid := range vms {
wg.Go(func() error { wg.Go(func() error {
start := time.Now() start := time.Now()
err := host.RebuildInstance(VM, vmid) err := node.BuildInstance(VM, vmid)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] error encountered while syncing vm %s.%d: %s", hostName, vmid, err) log.Printf("[ERR ] error encountered while syncing vm %s.%d: %s", nodeName, vmid, err)
} else { } else {
log.Printf("[INFO] synced vm %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds()) log.Printf("[INFO] synced vm %s.%d in %d ms", nodeName, vmid, time.Since(start).Milliseconds())
} }
return err return err
}) })
} }
// get node's CTs // get node's CTs
cts, err := host.Containers() cts, err := node.Containers()
if err != nil { if err != nil {
return err return err
} }
for _, vmid := range cts { for _, vmid := range cts {
wg.Go(func() error { wg.Go(func() error {
start := time.Now() start := time.Now()
err := host.RebuildInstance(CT, vmid) err := node.BuildInstance(CT, vmid)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] error encountered while syncing ct %s.%d: %s", hostName, vmid, err) log.Printf("[ERR ] error encountered while syncing ct %s.%d: %s", nodeName, vmid, err)
} else { } else {
log.Printf("[INFO] synced ct %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds()) log.Printf("[INFO] synced ct %s.%d in %d ms", nodeName, vmid, time.Since(start).Milliseconds())
} }
return err return err
@@ -200,7 +222,7 @@ func (cluster *Cluster) RebuildNode(hostName string) error {
} }
// check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved // check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved
for _, device := range host.Devices { for _, device := range node.Devices {
reserved := false reserved := false
for _, function := range device.Functions { for _, function := range device.Functions {
reserved = reserved || function.Reserved reserved = reserved || function.Reserved
@@ -208,18 +230,20 @@ func (cluster *Cluster) RebuildNode(hostName string) error {
device.Reserved = reserved device.Reserved = reserved
} }
node.cluster = cluster
return nil return nil
} }
func (host *Node) GetInstance(vmid uint) (*Instance, error) { func (node *Node) GetInstance(vmid uint) (*Instance, error) {
// aquire host lock // aquire node lock
host.lock.Lock() node.lock.Lock()
defer host.lock.Unlock() defer node.lock.Unlock()
// get instance // get instance
instance, ok := host.Instances[InstanceID(vmid)] instance, ok := node.Instances[InstanceID(vmid)]
if !ok { if !ok {
return nil, fmt.Errorf("vmid %d not in host %s", vmid, host.Name) return nil, fmt.Errorf("vmid %d not in node %s", vmid, node.Name)
} else { } else {
// aquire instance lock to wait in case of a concurrent write // aquire instance lock to wait in case of a concurrent write
instance.lock.Lock() instance.lock.Lock()
@@ -229,23 +253,50 @@ func (host *Node) GetInstance(vmid uint) (*Instance, error) {
} }
} }
// SyncInstance resynchronizes a single instance: it looks up the node named
// nodeName, looks up the instance vmid on that node to learn its type,
// rebuilds the instance with BuildInstance, and finally recomputes pool
// membership with ResolvePoolMembership.
//
// cluster.OK is cleared before the work starts and is set back to true only
// when every step succeeds. On failure (unknown node, unknown instance,
// rebuild error, or pool resolution error) the first error is returned
// unchanged and cluster.OK stays false.
func SyncInstance(cluster *Cluster, nodeName string, vmid uint) error {
	// Guard the flag with cluster.lock so that concurrent sync requests do not
	// race on it. The lock must be released before GetNode, which acquires
	// cluster.lock itself.
	cluster.lock.Lock()
	cluster.OK = false
	cluster.lock.Unlock()

	node, err := cluster.GetNode(nodeName)
	if err != nil {
		return err
	}

	// The existing record is only needed for its type, which selects the
	// VM or CT code path in BuildInstance.
	instance, err := node.GetInstance(vmid)
	if err != nil {
		return err
	}

	if err := node.BuildInstance(instance.Type, vmid); err != nil {
		return err
	}
	// NOTE(review): ResolvePoolMembership is called without cluster.lock held,
	// presumably because it locks internally; confirm.
	if err := cluster.ResolvePoolMembership(); err != nil {
		return err
	}

	cluster.lock.Lock()
	cluster.OK = true
	cluster.lock.Unlock()
	return nil
}
// hard sync instance // hard sync instance
// returns error if the instance could not be reached // returns error if the instance could not be reached
func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error { func (node *Node) BuildInstance(instancetype InstanceType, vmid uint) error {
instanceID := InstanceID(vmid) instanceID := InstanceID(vmid)
var instance *Instance var instance *Instance
var err error var err error
switch instancetype { switch instancetype {
case VM: case VM:
instance, err = host.VirtualMachine(vmid) instance, err = node.VirtualMachine(vmid)
case CT: case CT:
instance, err = host.Container(vmid) instance, err = node.Container(vmid)
} }
if err != nil && host.Instances[instanceID] == nil { // instance is unreachable and did not exist previously if err != nil && node.Instances[instanceID] == nil { // instance is unreachable and did not exist previously
// return an error because we requested to sync an instance that was not already in the cluster // return an error because we requested to sync an instance that was not already in the cluster
return fmt.Errorf("error retrieving %s.%d: %s", host.Name, instanceID, err.Error()) return fmt.Errorf("error retrieving %s.%d: %s", node.Name, instanceID, err.Error())
} }
// aquire lock on instance, release on return // aquire lock on instance, release on return
@@ -254,17 +305,17 @@ func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
wg, _ := errgroup.WithContext(context.Background()) wg, _ := errgroup.WithContext(context.Background())
if err != nil && host.Instances[instanceID] != nil { // host is unreachable and did exist previously if err != nil && node.Instances[instanceID] != nil { // node is unreachable and did exist previously
// assume the instance is gone and delete from cluster // assume the instance is gone and delete from cluster
delete(host.Instances, instanceID) delete(node.Instances, instanceID)
return nil return nil
} }
host.Instances[instanceID] = instance node.Instances[instanceID] = instance
for volid := range instance.configDisks { for volid := range instance.configDisks {
wg.Go(func() error { wg.Go(func() error {
err = instance.RebuildVolume(host, volid) err = instance.RebuildVolume(node, volid)
if err != nil { if err != nil {
log.Printf("[ERR ] error rebuilding volume %s: %s", volid, err) log.Printf("[ERR ] error rebuilding volume %s: %s", volid, err)
} }
@@ -274,7 +325,7 @@ func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
for netid := range instance.configNets { for netid := range instance.configNets {
wg.Go(func() error { wg.Go(func() error {
err = instance.RebuildNet(host, netid) err = instance.RebuildNet(node, netid)
if err != nil { if err != nil {
log.Printf("[ERR ] error rebuilding net %s: %s", netid, err) log.Printf("[ERR ] error rebuilding net %s: %s", netid, err)
return err return err
@@ -285,7 +336,7 @@ func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
for deviceid := range instance.configHostPCIs { for deviceid := range instance.configHostPCIs {
wg.Go(func() error { wg.Go(func() error {
err = instance.RebuildDevice(host, deviceid) err = instance.RebuildDevice(node, deviceid)
if err != nil { if err != nil {
log.Printf("[ERR ] error rebuilding pci %s: %s", deviceid, err) log.Printf("[ERR ] error rebuilding pci %s: %s", deviceid, err)
} }
@@ -299,20 +350,21 @@ func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
} }
if instance.Type == VM { if instance.Type == VM {
err = instance.RebuildBoot(host) err = instance.RebuildBoot(node)
if err != nil { if err != nil {
log.Printf("[ERR ] error rebuilding boot: %s", err) log.Printf("[ERR ] error rebuilding boot: %s", err)
} }
return err return err
} else {
return nil
} }
instance.node = node
return nil
} }
func (instance *Instance) RebuildVolume(host *Node, volid string) error { func (instance *Instance) RebuildVolume(node *Node, volid string) error {
volumeDataString := instance.configDisks[volid] volumeDataString := instance.configDisks[volid]
volume, err := GetVolumeInfo(host, volumeDataString) volume, err := GetVolumeInfo(node, volumeDataString)
if err != nil { if err != nil {
return err return err
} }
@@ -325,7 +377,7 @@ func (instance *Instance) RebuildVolume(host *Node, volid string) error {
return nil return nil
} }
func (instance *Instance) RebuildNet(host *Node, netid string) error { func (instance *Instance) RebuildNet(node *Node, netid string) error {
net := instance.configNets[netid] net := instance.configNets[netid]
netinfo, err := GetNetInfo(net) netinfo, err := GetNetInfo(net)
@@ -339,10 +391,10 @@ func (instance *Instance) RebuildNet(host *Node, netid string) error {
return nil return nil
} }
func (instance *Instance) RebuildDevice(host *Node, deviceid string) error { func (instance *Instance) RebuildDevice(node *Node, deviceid string) error {
instanceDevice, ok := instance.configHostPCIs[deviceid] instanceDevice, ok := instance.configHostPCIs[deviceid]
if !ok { // if device does not exist if !ok { // if device does not exist
log.Printf("[WARN] %s not found in devices on node %s", deviceid, host.Name) log.Printf("[WARN] %s not found in devices on node %s", deviceid, node.Name)
return nil return nil
} }
@@ -350,7 +402,7 @@ func (instance *Instance) RebuildDevice(host *Node, deviceid string) error {
instanceDeviceBusID := DeviceID(deviceid) instanceDeviceBusID := DeviceID(deviceid)
if DeviceBusIDIsSuperDevice(hostDeviceBusID) { if DeviceBusIDIsSuperDevice(hostDeviceBusID) {
instance.Devices[DeviceID(instanceDeviceBusID)] = host.Devices[DeviceBus(hostDeviceBusID)] instance.Devices[DeviceID(instanceDeviceBusID)] = node.Devices[DeviceBus(hostDeviceBusID)]
for _, function := range instance.Devices[DeviceID(instanceDeviceBusID)].Functions { for _, function := range instance.Devices[DeviceID(instanceDeviceBusID)].Functions {
function.Reserved = true function.Reserved = true
} }
@@ -363,7 +415,7 @@ func (instance *Instance) RebuildDevice(host *Node, deviceid string) error {
return nil return nil
} }
func (instance *Instance) RebuildBoot(host *Node) error { func (instance *Instance) RebuildBoot(node *Node) error {
instance.Boot = BootOrder{} instance.Boot = BootOrder{}
eligibleBoot := map[string]bool{} eligibleBoot := map[string]bool{}
+2
View File
@@ -28,6 +28,7 @@ type Node struct {
Instances map[InstanceID]*Instance `json:"instances"` Instances map[InstanceID]*Instance `json:"instances"`
pvenode *proxmox.Node pvenode *proxmox.Node
storage map[string][]*proxmox.StorageContent storage map[string][]*proxmox.StorageContent
cluster *Cluster
} }
type InstanceID = paas.InstanceID type InstanceID = paas.InstanceID
@@ -45,6 +46,7 @@ type Instance struct {
configNets map[string]string configNets map[string]string
configHostPCIs map[string]string configHostPCIs map[string]string
configBoot string configBoot string
node *Node
} }
type VolumeID = paas.VolumeID type VolumeID = paas.VolumeID