improve sync latency significantly by using goroutines

This commit is contained in:
2026-06-01 21:17:51 +00:00
parent a98745dd17
commit f1deacee10
3 changed files with 204 additions and 202 deletions
+4 -3
View File
@@ -39,7 +39,7 @@ func Run() {
if err != nil { if err != nil {
log.Printf("[Error] error encountered while syncing cluster: %s", err) log.Printf("[Error] error encountered while syncing cluster: %s", err)
} else { } else {
log.Printf("[INFO] synced cluster in %fs\n", time.Since(start).Seconds()) log.Printf("[INFO] synced cluster in %d ms\n", time.Since(start).Milliseconds())
} }
// set repeating update for full rebuilds // set repeating update for full rebuilds
@@ -56,10 +56,11 @@ func Run() {
log.Printf("[INFO] starting cluster sync\n") log.Printf("[INFO] starting cluster sync\n")
err := cluster.Sync() err := cluster.Sync()
if err != nil { if err != nil {
log.Printf("[Error] error encountered while syncing cluster: %s", err) log.Printf("[ERR ] error encountered while syncing cluster: %s", err)
} else { } else {
log.Printf("[INFO] synced cluster in %fs\n", time.Since(start).Seconds()) log.Printf("[INFO] synced cluster in %d ms", time.Since(start).Milliseconds())
} }
} }
} }
}() }()
+199 -199
View File
@@ -5,8 +5,11 @@ import (
"fmt" "fmt"
"log" "log"
"strings" "strings"
"time"
paas "proxmoxaas-common-lib" paas "proxmoxaas-common-lib"
"golang.org/x/sync/errgroup"
) )
func (cluster *Cluster) Init(pve ProxmoxClient) { func (cluster *Cluster) Init(pve ProxmoxClient) {
@@ -14,287 +17,284 @@ func (cluster *Cluster) Init(pve ProxmoxClient) {
} }
func (cluster *Cluster) Get() (*Cluster, error) { func (cluster *Cluster) Get() (*Cluster, error) {
cluster_ch := make(chan *Cluster) // aquire cluster lock
err_ch := make(chan error) cluster.lock.Lock()
defer cluster.lock.Unlock()
go func() { return cluster, nil
// aquire cluster lock
cluster.lock.Lock()
defer cluster.lock.Unlock()
cluster_ch <- cluster
err_ch <- nil
}()
return <-cluster_ch, <-err_ch
} }
// hard sync cluster // hard sync cluster
func (cluster *Cluster) Sync() error { func (cluster *Cluster) Sync() error {
err_ch := make(chan error) // aquire lock on cluster, release on return
cluster.lock.Lock()
go func() { cluster.Nodes = make(map[string]*Node)
// aquire lock on cluster, release on return
cluster.lock.Lock()
cluster.Nodes = make(map[string]*Node) wg, _ := errgroup.WithContext(context.Background())
// get all nodes // get all nodes
nodes, err := cluster.pve.Nodes() nodes, err := cluster.pve.Nodes()
if err != nil { if err != nil {
err_ch <- err return err
return }
}
// for each node: // for each node:
for _, hostName := range nodes { for _, hostName := range nodes {
wg.Go(func() error {
start := time.Now()
// rebuild node // rebuild node
err := cluster.RebuildNode(hostName) err := cluster.RebuildNode(hostName)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] %s", err) log.Printf("[ERR ] error encountered while syncing node %s: %s", hostName, err)
} else { // otherwise log success } else {
log.Printf("[INFO] successfully synced node %s", hostName) log.Printf("[INFO] synced node %s in %d ms", hostName, time.Since(start).Milliseconds())
} }
} return err
})
}
cluster.lock.Unlock() err = wg.Wait()
if err != nil {
return err
}
err = cluster.ResolvePoolMembership() cluster.lock.Unlock()
if err != nil {
err_ch <- err
}
err_ch <- nil err = cluster.ResolvePoolMembership()
}() if err != nil {
return err
}
return <-err_ch return nil
} }
func (cluster *Cluster) ResolvePoolMembership() error { func (cluster *Cluster) ResolvePoolMembership() error {
err_ch := make(chan error) // aquire lock on cluster, release on return
cluster.lock.Lock()
go func() { //resolve pool membership
// aquire lock on cluster, release on return pools, err := cluster.pve.client.Pools(context.Background())
cluster.lock.Lock() if err != nil {
return err
//resolve pool membership }
pools, err := cluster.pve.client.Pools(context.Background()) for _, pool := range pools {
pool, err = cluster.pve.client.Pool(context.Background(), pool.PoolID)
if err != nil { if err != nil {
err_ch <- err return err
} }
for _, pool := range pools { for _, member := range pool.Members {
pool, err = cluster.pve.client.Pool(context.Background(), pool.PoolID) if member.Type == "lxc" || member.Type == "qemu" {
if err != nil { node, ok := cluster.Nodes[member.Node]
err_ch <- err if !ok {
} return fmt.Errorf("Instance %d has no node", member.VMID)
for _, member := range pool.Members {
if member.Type == "lxc" || member.Type == "qemu" {
node, ok := cluster.Nodes[member.Node]
if !ok {
err_ch <- fmt.Errorf("Instance %d has no node", member.VMID)
}
instance, ok := node.Instances[InstanceID(member.VMID)]
if !ok {
err_ch <- fmt.Errorf("Instance %d claimed to be in node %s but was not", member.VMID, node.Name)
}
instance.Pool = pool.PoolID
log.Printf("[INFO] successfully resolved pool membership for vmid=%d pool=%s", member.VMID, pool.PoolID)
} }
instance, ok := node.Instances[InstanceID(member.VMID)]
if !ok {
return fmt.Errorf("Instance %d claimed to be in node %s but was not", member.VMID, node.Name)
}
instance.Pool = pool.PoolID
log.Printf("[INFO] resolved pool membership for vmid=%d pool=%s", member.VMID, pool.PoolID)
} }
} }
}
err_ch <- nil cluster.lock.Unlock()
return nil
cluster.lock.Unlock()
}()
return <-err_ch
} }
// get a node in the cluster // get a node in the cluster
func (cluster *Cluster) GetNode(hostName string) (*Node, error) { func (cluster *Cluster) GetNode(hostName string) (*Node, error) {
host_ch := make(chan *Node) // aquire cluster lock
err_ch := make(chan error) cluster.lock.Lock()
defer cluster.lock.Unlock()
// get host
host, ok := cluster.Nodes[hostName]
if !ok {
return nil, fmt.Errorf("%s not in cluster", hostName)
} else {
// aquire host lock to wait in case of a concurrent write
host.lock.Lock()
defer host.lock.Unlock()
go func() { return host, nil
// aquire cluster lock }
cluster.lock.Lock()
defer cluster.lock.Unlock()
// get host
host, ok := cluster.Nodes[hostName]
if !ok {
host_ch <- nil
err_ch <- fmt.Errorf("%s not in cluster", hostName)
} else {
// aquire host lock to wait in case of a concurrent write
host.lock.Lock()
defer host.lock.Unlock()
host_ch <- host
err_ch <- nil
}
}()
return <-host_ch, <-err_ch
} }
// hard sync node // hard sync node
// returns error if the node could not be reached // returns error if the node could not be reached
func (cluster *Cluster) RebuildNode(hostName string) error { func (cluster *Cluster) RebuildNode(hostName string) error {
err_ch := make(chan error) host, err := cluster.pve.Node(hostName)
if err != nil && cluster.Nodes[hostName] == nil { // host is unreachable and did not exist previously
// return an error because we requested to sync a node that was not already in the cluster
return fmt.Errorf("error retrieving %s: %s", hostName, err.Error())
}
go func() { // aquire lock on host, release on return
host, err := cluster.pve.Node(hostName) host.lock.Lock()
if err != nil && cluster.Nodes[hostName] == nil { // host is unreachable and did not exist previously defer host.lock.Unlock()
// return an error because we requested to sync a node that was not already in the cluster
err_ch <- fmt.Errorf("error retrieving %s: %s", hostName, err.Error())
}
// aquire lock on host, release on return wg, _ := errgroup.WithContext(context.Background())
host.lock.Lock()
defer host.lock.Unlock()
if err != nil && cluster.Nodes[hostName] != nil { // host is unreachable and did exist previously if err != nil && cluster.Nodes[hostName] != nil { // host is unreachable and did exist previously
// assume the node is down or gone and delete from cluster // assume the node is down or gone and delete from cluster
delete(cluster.Nodes, hostName) delete(cluster.Nodes, hostName)
err_ch <- nil return nil
} }
cluster.Nodes[hostName] = host cluster.Nodes[hostName] = host
// get node's VMs // get node's VMs
vms, err := host.VirtualMachines() vms, err := host.VirtualMachines()
if err != nil { if err != nil {
err_ch <- err return err
} }
for _, vmid := range vms { for _, vmid := range vms {
wg.Go(func() error {
start := time.Now()
err := host.RebuildInstance(VM, vmid) err := host.RebuildInstance(VM, vmid)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] %s", err) log.Printf("[ERR ] error encountered while syncing vm %s.%d: %s", hostName, vmid, err)
} else { } else {
log.Printf("[INFO] successfully synced vm %s.%d", hostName, vmid) log.Printf("[INFO] synced vm %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds())
} }
} return err
})
}
// get node's CTs // get node's CTs
cts, err := host.Containers() cts, err := host.Containers()
if err != nil { if err != nil {
err_ch <- err return err
} }
for _, vmid := range cts { for _, vmid := range cts {
wg.Go(func() error {
start := time.Now()
err := host.RebuildInstance(CT, vmid) err := host.RebuildInstance(CT, vmid)
if err != nil { // if an error was encountered, continue and log the error if err != nil { // if an error was encountered, continue and log the error
log.Printf("[ERR ] %s", err) log.Printf("[ERR ] error encountered while syncing ct %s.%d: %s", hostName, vmid, err)
} else { } else {
log.Printf("[INFO] successfully synced ct %s.%d", hostName, vmid) log.Printf("[INFO] synced ct %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds())
}
}
// check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved
for _, device := range host.Devices {
reserved := false
for _, function := range device.Functions {
reserved = reserved || function.Reserved
} }
device.Reserved = reserved return err
} })
}
err_ch <- nil err = wg.Wait()
}() if err != nil {
return <-err_ch return err
}
// check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved
for _, device := range host.Devices {
reserved := false
for _, function := range device.Functions {
reserved = reserved || function.Reserved
}
device.Reserved = reserved
}
return nil
} }
func (host *Node) GetInstance(vmid uint) (*Instance, error) { func (host *Node) GetInstance(vmid uint) (*Instance, error) {
instance_ch := make(chan *Instance)
err_ch := make(chan error)
go func() { // aquire host lock
// aquire host lock host.lock.Lock()
host.lock.Lock() defer host.lock.Unlock()
defer host.lock.Unlock() // get instance
// get instance instance, ok := host.Instances[InstanceID(vmid)]
instance, ok := host.Instances[InstanceID(vmid)] if !ok {
if !ok { return nil, fmt.Errorf("vmid %d not in host %s", vmid, host.Name)
instance_ch <- nil } else {
err_ch <- fmt.Errorf("vmid %d not in host %s", vmid, host.Name) // aquire instance lock to wait in case of a concurrent write
} else { instance.lock.Lock()
// aquire instance lock to wait in case of a concurrent write defer instance.lock.Unlock()
instance.lock.Lock()
defer instance.lock.Unlock()
instance_ch <- instance return instance, nil
err_ch <- nil }
}
}()
return <-instance_ch, <-err_ch
} }
// hard sync instance // hard sync instance
// returns error if the instance could not be reached // returns error if the instance could not be reached
func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error { func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
err_ch := make(chan error) instanceID := InstanceID(vmid)
var instance *Instance
var err error
switch instancetype {
case VM:
instance, err = host.VirtualMachine(vmid)
case CT:
instance, err = host.Container(vmid)
go func() { }
instanceID := InstanceID(vmid)
var instance *Instance
var err error
switch instancetype {
case VM:
instance, err = host.VirtualMachine(vmid)
case CT:
instance, err = host.Container(vmid)
} if err != nil && host.Instances[instanceID] == nil { // instance is unreachable and did not exist previously
// return an error because we requested to sync an instance that was not already in the cluster
return fmt.Errorf("error retrieving %s.%d: %s", host.Name, instanceID, err.Error())
}
if err != nil && host.Instances[instanceID] == nil { // instance is unreachable and did not exist previously // aquire lock on instance, release on return
// return an error because we requested to sync an instance that was not already in the cluster instance.lock.Lock()
err_ch <- fmt.Errorf("error retrieving %s.%d: %s", host.Name, instanceID, err.Error()) defer instance.lock.Unlock()
}
// aquire lock on instance, release on return wg, _ := errgroup.WithContext(context.Background())
instance.lock.Lock()
defer instance.lock.Unlock()
if err != nil && host.Instances[instanceID] != nil { // host is unreachable and did exist previously if err != nil && host.Instances[instanceID] != nil { // host is unreachable and did exist previously
// assume the instance is gone and delete from cluster // assume the instance is gone and delete from cluster
delete(host.Instances, instanceID) delete(host.Instances, instanceID)
err_ch <- nil return nil
} }
host.Instances[instanceID] = instance host.Instances[instanceID] = instance
for volid := range instance.configDisks { for volid := range instance.configDisks {
wg.Go(func() error {
err = instance.RebuildVolume(host, volid) err = instance.RebuildVolume(host, volid)
if err != nil { if err != nil {
err_ch <- err log.Printf("[ERR ] error rebuilding volume %s: %s", volid, err)
} }
} return err
})
}
for netid := range instance.configNets { for netid := range instance.configNets {
wg.Go(func() error {
err = instance.RebuildNet(host, netid) err = instance.RebuildNet(host, netid)
if err != nil { if err != nil {
err_ch <- err log.Printf("[ERR ] error rebuilding net %s: %s", netid, err)
return err
} }
} return err
})
}
for deviceid := range instance.configHostPCIs { for deviceid := range instance.configHostPCIs {
wg.Go(func() error {
err = instance.RebuildDevice(host, deviceid) err = instance.RebuildDevice(host, deviceid)
if err != nil { if err != nil {
err_ch <- err log.Printf("[ERR ] error rebuilding pci %s: %s", deviceid, err)
} }
return err
})
}
err = wg.Wait()
if err != nil {
return err
}
if instance.Type == VM {
err = instance.RebuildBoot(host)
if err != nil {
log.Printf("[ERR ] error rebuilding boot: %s", err)
} }
return err
if instance.Type == VM { } else {
err = instance.RebuildBoot(host) return nil
if err != nil { }
err_ch <- err
}
}
err_ch <- nil
}()
return <-err_ch
} }
func (instance *Instance) RebuildVolume(host *Node, volid string) error { func (instance *Instance) RebuildVolume(host *Node, volid string) error {
+1
View File
@@ -5,6 +5,7 @@ go 1.26.2
require ( require (
github.com/gin-gonic/gin v1.12.0 github.com/gin-gonic/gin v1.12.0
github.com/luthermonson/go-proxmox v0.7.0 github.com/luthermonson/go-proxmox v0.7.0
golang.org/x/sync v0.20.0
proxmoxaas-common-lib v0.0.0 proxmoxaas-common-lib v0.0.0
) )