improve sync latency significantly by using goroutines
This commit is contained in:
+4
-3
@@ -39,7 +39,7 @@ func Run() {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("[Error] error encountered while syncing cluster: %s", err)
|
log.Printf("[Error] error encountered while syncing cluster: %s", err)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("[INFO] synced cluster in %fs\n", time.Since(start).Seconds())
|
log.Printf("[INFO] synced cluster in %d ms\n", time.Since(start).Milliseconds())
|
||||||
}
|
}
|
||||||
|
|
||||||
// set repeating update for full rebuilds
|
// set repeating update for full rebuilds
|
||||||
@@ -56,10 +56,11 @@ func Run() {
|
|||||||
log.Printf("[INFO] starting cluster sync\n")
|
log.Printf("[INFO] starting cluster sync\n")
|
||||||
err := cluster.Sync()
|
err := cluster.Sync()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("[Error] error encountered while syncing cluster: %s", err)
|
log.Printf("[ERR ] error encountered while syncing cluster: %s", err)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("[INFO] synced cluster in %fs\n", time.Since(start).Seconds())
|
log.Printf("[INFO] synced cluster in %d ms", time.Since(start).Milliseconds())
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|||||||
+199
-199
@@ -5,8 +5,11 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
paas "proxmoxaas-common-lib"
|
paas "proxmoxaas-common-lib"
|
||||||
|
|
||||||
|
"golang.org/x/sync/errgroup"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (cluster *Cluster) Init(pve ProxmoxClient) {
|
func (cluster *Cluster) Init(pve ProxmoxClient) {
|
||||||
@@ -14,287 +17,284 @@ func (cluster *Cluster) Init(pve ProxmoxClient) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (cluster *Cluster) Get() (*Cluster, error) {
|
func (cluster *Cluster) Get() (*Cluster, error) {
|
||||||
cluster_ch := make(chan *Cluster)
|
// aquire cluster lock
|
||||||
err_ch := make(chan error)
|
cluster.lock.Lock()
|
||||||
|
defer cluster.lock.Unlock()
|
||||||
|
|
||||||
go func() {
|
return cluster, nil
|
||||||
// aquire cluster lock
|
|
||||||
cluster.lock.Lock()
|
|
||||||
defer cluster.lock.Unlock()
|
|
||||||
cluster_ch <- cluster
|
|
||||||
err_ch <- nil
|
|
||||||
}()
|
|
||||||
|
|
||||||
return <-cluster_ch, <-err_ch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// hard sync cluster
|
// hard sync cluster
|
||||||
func (cluster *Cluster) Sync() error {
|
func (cluster *Cluster) Sync() error {
|
||||||
err_ch := make(chan error)
|
// aquire lock on cluster, release on return
|
||||||
|
cluster.lock.Lock()
|
||||||
|
|
||||||
go func() {
|
cluster.Nodes = make(map[string]*Node)
|
||||||
// aquire lock on cluster, release on return
|
|
||||||
cluster.lock.Lock()
|
|
||||||
|
|
||||||
cluster.Nodes = make(map[string]*Node)
|
wg, _ := errgroup.WithContext(context.Background())
|
||||||
|
|
||||||
// get all nodes
|
// get all nodes
|
||||||
nodes, err := cluster.pve.Nodes()
|
nodes, err := cluster.pve.Nodes()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
return err
|
||||||
return
|
}
|
||||||
}
|
|
||||||
// for each node:
|
// for each node:
|
||||||
for _, hostName := range nodes {
|
for _, hostName := range nodes {
|
||||||
|
wg.Go(func() error {
|
||||||
|
start := time.Now()
|
||||||
// rebuild node
|
// rebuild node
|
||||||
err := cluster.RebuildNode(hostName)
|
err := cluster.RebuildNode(hostName)
|
||||||
if err != nil { // if an error was encountered, continue and log the error
|
if err != nil { // if an error was encountered, continue and log the error
|
||||||
log.Printf("[ERR ] %s", err)
|
log.Printf("[ERR ] error encountered while syncing node %s: %s", hostName, err)
|
||||||
} else { // otherwise log success
|
} else {
|
||||||
log.Printf("[INFO] successfully synced node %s", hostName)
|
log.Printf("[INFO] synced node %s in %d ms", hostName, time.Since(start).Milliseconds())
|
||||||
}
|
}
|
||||||
}
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
cluster.lock.Unlock()
|
err = wg.Wait()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
err = cluster.ResolvePoolMembership()
|
cluster.lock.Unlock()
|
||||||
if err != nil {
|
|
||||||
err_ch <- err
|
|
||||||
}
|
|
||||||
|
|
||||||
err_ch <- nil
|
err = cluster.ResolvePoolMembership()
|
||||||
}()
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
return <-err_ch
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cluster *Cluster) ResolvePoolMembership() error {
|
func (cluster *Cluster) ResolvePoolMembership() error {
|
||||||
err_ch := make(chan error)
|
// aquire lock on cluster, release on return
|
||||||
|
cluster.lock.Lock()
|
||||||
|
|
||||||
go func() {
|
//resolve pool membership
|
||||||
// aquire lock on cluster, release on return
|
pools, err := cluster.pve.client.Pools(context.Background())
|
||||||
cluster.lock.Lock()
|
if err != nil {
|
||||||
|
return err
|
||||||
//resolve pool membership
|
}
|
||||||
pools, err := cluster.pve.client.Pools(context.Background())
|
for _, pool := range pools {
|
||||||
|
pool, err = cluster.pve.client.Pool(context.Background(), pool.PoolID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
return err
|
||||||
}
|
}
|
||||||
for _, pool := range pools {
|
for _, member := range pool.Members {
|
||||||
pool, err = cluster.pve.client.Pool(context.Background(), pool.PoolID)
|
if member.Type == "lxc" || member.Type == "qemu" {
|
||||||
if err != nil {
|
node, ok := cluster.Nodes[member.Node]
|
||||||
err_ch <- err
|
if !ok {
|
||||||
}
|
return fmt.Errorf("Instance %d has no node", member.VMID)
|
||||||
for _, member := range pool.Members {
|
|
||||||
if member.Type == "lxc" || member.Type == "qemu" {
|
|
||||||
node, ok := cluster.Nodes[member.Node]
|
|
||||||
if !ok {
|
|
||||||
err_ch <- fmt.Errorf("Instance %d has no node", member.VMID)
|
|
||||||
}
|
|
||||||
instance, ok := node.Instances[InstanceID(member.VMID)]
|
|
||||||
if !ok {
|
|
||||||
err_ch <- fmt.Errorf("Instance %d claimed to be in node %s but was not", member.VMID, node.Name)
|
|
||||||
}
|
|
||||||
instance.Pool = pool.PoolID
|
|
||||||
log.Printf("[INFO] successfully resolved pool membership for vmid=%d pool=%s", member.VMID, pool.PoolID)
|
|
||||||
}
|
}
|
||||||
|
instance, ok := node.Instances[InstanceID(member.VMID)]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("Instance %d claimed to be in node %s but was not", member.VMID, node.Name)
|
||||||
|
}
|
||||||
|
instance.Pool = pool.PoolID
|
||||||
|
log.Printf("[INFO] resolved pool membership for vmid=%d pool=%s", member.VMID, pool.PoolID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
err_ch <- nil
|
cluster.lock.Unlock()
|
||||||
|
return nil
|
||||||
cluster.lock.Unlock()
|
|
||||||
}()
|
|
||||||
|
|
||||||
return <-err_ch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// get a node in the cluster
|
// get a node in the cluster
|
||||||
func (cluster *Cluster) GetNode(hostName string) (*Node, error) {
|
func (cluster *Cluster) GetNode(hostName string) (*Node, error) {
|
||||||
host_ch := make(chan *Node)
|
// aquire cluster lock
|
||||||
err_ch := make(chan error)
|
cluster.lock.Lock()
|
||||||
|
defer cluster.lock.Unlock()
|
||||||
|
// get host
|
||||||
|
host, ok := cluster.Nodes[hostName]
|
||||||
|
if !ok {
|
||||||
|
return nil, fmt.Errorf("%s not in cluster", hostName)
|
||||||
|
} else {
|
||||||
|
// aquire host lock to wait in case of a concurrent write
|
||||||
|
host.lock.Lock()
|
||||||
|
defer host.lock.Unlock()
|
||||||
|
|
||||||
go func() {
|
return host, nil
|
||||||
// aquire cluster lock
|
}
|
||||||
cluster.lock.Lock()
|
|
||||||
defer cluster.lock.Unlock()
|
|
||||||
// get host
|
|
||||||
host, ok := cluster.Nodes[hostName]
|
|
||||||
if !ok {
|
|
||||||
host_ch <- nil
|
|
||||||
err_ch <- fmt.Errorf("%s not in cluster", hostName)
|
|
||||||
} else {
|
|
||||||
// aquire host lock to wait in case of a concurrent write
|
|
||||||
host.lock.Lock()
|
|
||||||
defer host.lock.Unlock()
|
|
||||||
|
|
||||||
host_ch <- host
|
|
||||||
err_ch <- nil
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
return <-host_ch, <-err_ch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// hard sync node
|
// hard sync node
|
||||||
// returns error if the node could not be reached
|
// returns error if the node could not be reached
|
||||||
func (cluster *Cluster) RebuildNode(hostName string) error {
|
func (cluster *Cluster) RebuildNode(hostName string) error {
|
||||||
err_ch := make(chan error)
|
host, err := cluster.pve.Node(hostName)
|
||||||
|
if err != nil && cluster.Nodes[hostName] == nil { // host is unreachable and did not exist previously
|
||||||
|
// return an error because we requested to sync a node that was not already in the cluster
|
||||||
|
return fmt.Errorf("error retrieving %s: %s", hostName, err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
go func() {
|
// aquire lock on host, release on return
|
||||||
host, err := cluster.pve.Node(hostName)
|
host.lock.Lock()
|
||||||
if err != nil && cluster.Nodes[hostName] == nil { // host is unreachable and did not exist previously
|
defer host.lock.Unlock()
|
||||||
// return an error because we requested to sync a node that was not already in the cluster
|
|
||||||
err_ch <- fmt.Errorf("error retrieving %s: %s", hostName, err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
// aquire lock on host, release on return
|
wg, _ := errgroup.WithContext(context.Background())
|
||||||
host.lock.Lock()
|
|
||||||
defer host.lock.Unlock()
|
|
||||||
|
|
||||||
if err != nil && cluster.Nodes[hostName] != nil { // host is unreachable and did exist previously
|
if err != nil && cluster.Nodes[hostName] != nil { // host is unreachable and did exist previously
|
||||||
// assume the node is down or gone and delete from cluster
|
// assume the node is down or gone and delete from cluster
|
||||||
delete(cluster.Nodes, hostName)
|
delete(cluster.Nodes, hostName)
|
||||||
err_ch <- nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
cluster.Nodes[hostName] = host
|
cluster.Nodes[hostName] = host
|
||||||
|
|
||||||
// get node's VMs
|
// get node's VMs
|
||||||
vms, err := host.VirtualMachines()
|
vms, err := host.VirtualMachines()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
return err
|
||||||
|
|
||||||
}
|
}
|
||||||
for _, vmid := range vms {
|
for _, vmid := range vms {
|
||||||
|
wg.Go(func() error {
|
||||||
|
start := time.Now()
|
||||||
err := host.RebuildInstance(VM, vmid)
|
err := host.RebuildInstance(VM, vmid)
|
||||||
if err != nil { // if an error was encountered, continue and log the error
|
if err != nil { // if an error was encountered, continue and log the error
|
||||||
log.Printf("[ERR ] %s", err)
|
log.Printf("[ERR ] error encountered while syncing vm %s.%d: %s", hostName, vmid, err)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("[INFO] successfully synced vm %s.%d", hostName, vmid)
|
log.Printf("[INFO] synced vm %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds())
|
||||||
}
|
}
|
||||||
}
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// get node's CTs
|
// get node's CTs
|
||||||
cts, err := host.Containers()
|
cts, err := host.Containers()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
return err
|
||||||
}
|
}
|
||||||
for _, vmid := range cts {
|
for _, vmid := range cts {
|
||||||
|
wg.Go(func() error {
|
||||||
|
start := time.Now()
|
||||||
err := host.RebuildInstance(CT, vmid)
|
err := host.RebuildInstance(CT, vmid)
|
||||||
if err != nil { // if an error was encountered, continue and log the error
|
if err != nil { // if an error was encountered, continue and log the error
|
||||||
log.Printf("[ERR ] %s", err)
|
log.Printf("[ERR ] error encountered while syncing ct %s.%d: %s", hostName, vmid, err)
|
||||||
} else {
|
} else {
|
||||||
log.Printf("[INFO] successfully synced ct %s.%d", hostName, vmid)
|
log.Printf("[INFO] synced ct %s.%d in %d ms", hostName, vmid, time.Since(start).Milliseconds())
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved
|
|
||||||
for _, device := range host.Devices {
|
|
||||||
reserved := false
|
|
||||||
for _, function := range device.Functions {
|
|
||||||
reserved = reserved || function.Reserved
|
|
||||||
}
|
}
|
||||||
device.Reserved = reserved
|
return err
|
||||||
}
|
})
|
||||||
|
}
|
||||||
|
|
||||||
err_ch <- nil
|
err = wg.Wait()
|
||||||
}()
|
if err != nil {
|
||||||
return <-err_ch
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// check node device reserved by iterating over each function, we will assume that a single reserved function means the device is also reserved
|
||||||
|
for _, device := range host.Devices {
|
||||||
|
reserved := false
|
||||||
|
for _, function := range device.Functions {
|
||||||
|
reserved = reserved || function.Reserved
|
||||||
|
}
|
||||||
|
device.Reserved = reserved
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (host *Node) GetInstance(vmid uint) (*Instance, error) {
|
func (host *Node) GetInstance(vmid uint) (*Instance, error) {
|
||||||
instance_ch := make(chan *Instance)
|
|
||||||
err_ch := make(chan error)
|
|
||||||
|
|
||||||
go func() {
|
// aquire host lock
|
||||||
// aquire host lock
|
host.lock.Lock()
|
||||||
host.lock.Lock()
|
defer host.lock.Unlock()
|
||||||
defer host.lock.Unlock()
|
// get instance
|
||||||
// get instance
|
instance, ok := host.Instances[InstanceID(vmid)]
|
||||||
instance, ok := host.Instances[InstanceID(vmid)]
|
if !ok {
|
||||||
if !ok {
|
return nil, fmt.Errorf("vmid %d not in host %s", vmid, host.Name)
|
||||||
instance_ch <- nil
|
} else {
|
||||||
err_ch <- fmt.Errorf("vmid %d not in host %s", vmid, host.Name)
|
// aquire instance lock to wait in case of a concurrent write
|
||||||
} else {
|
instance.lock.Lock()
|
||||||
// aquire instance lock to wait in case of a concurrent write
|
defer instance.lock.Unlock()
|
||||||
instance.lock.Lock()
|
|
||||||
defer instance.lock.Unlock()
|
|
||||||
|
|
||||||
instance_ch <- instance
|
return instance, nil
|
||||||
err_ch <- nil
|
}
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
return <-instance_ch, <-err_ch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// hard sync instance
|
// hard sync instance
|
||||||
// returns error if the instance could not be reached
|
// returns error if the instance could not be reached
|
||||||
func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
|
func (host *Node) RebuildInstance(instancetype InstanceType, vmid uint) error {
|
||||||
err_ch := make(chan error)
|
instanceID := InstanceID(vmid)
|
||||||
|
var instance *Instance
|
||||||
|
var err error
|
||||||
|
switch instancetype {
|
||||||
|
case VM:
|
||||||
|
instance, err = host.VirtualMachine(vmid)
|
||||||
|
case CT:
|
||||||
|
instance, err = host.Container(vmid)
|
||||||
|
|
||||||
go func() {
|
}
|
||||||
instanceID := InstanceID(vmid)
|
|
||||||
var instance *Instance
|
|
||||||
var err error
|
|
||||||
switch instancetype {
|
|
||||||
case VM:
|
|
||||||
instance, err = host.VirtualMachine(vmid)
|
|
||||||
case CT:
|
|
||||||
instance, err = host.Container(vmid)
|
|
||||||
|
|
||||||
}
|
if err != nil && host.Instances[instanceID] == nil { // instance is unreachable and did not exist previously
|
||||||
|
// return an error because we requested to sync an instance that was not already in the cluster
|
||||||
|
return fmt.Errorf("error retrieving %s.%d: %s", host.Name, instanceID, err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil && host.Instances[instanceID] == nil { // instance is unreachable and did not exist previously
|
// aquire lock on instance, release on return
|
||||||
// return an error because we requested to sync an instance that was not already in the cluster
|
instance.lock.Lock()
|
||||||
err_ch <- fmt.Errorf("error retrieving %s.%d: %s", host.Name, instanceID, err.Error())
|
defer instance.lock.Unlock()
|
||||||
}
|
|
||||||
|
|
||||||
// aquire lock on instance, release on return
|
wg, _ := errgroup.WithContext(context.Background())
|
||||||
instance.lock.Lock()
|
|
||||||
defer instance.lock.Unlock()
|
|
||||||
|
|
||||||
if err != nil && host.Instances[instanceID] != nil { // host is unreachable and did exist previously
|
if err != nil && host.Instances[instanceID] != nil { // host is unreachable and did exist previously
|
||||||
// assume the instance is gone and delete from cluster
|
// assume the instance is gone and delete from cluster
|
||||||
delete(host.Instances, instanceID)
|
delete(host.Instances, instanceID)
|
||||||
err_ch <- nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
host.Instances[instanceID] = instance
|
host.Instances[instanceID] = instance
|
||||||
|
|
||||||
for volid := range instance.configDisks {
|
for volid := range instance.configDisks {
|
||||||
|
wg.Go(func() error {
|
||||||
err = instance.RebuildVolume(host, volid)
|
err = instance.RebuildVolume(host, volid)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
log.Printf("[ERR ] error rebuilding volume %s: %s", volid, err)
|
||||||
}
|
}
|
||||||
}
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
for netid := range instance.configNets {
|
for netid := range instance.configNets {
|
||||||
|
wg.Go(func() error {
|
||||||
err = instance.RebuildNet(host, netid)
|
err = instance.RebuildNet(host, netid)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
log.Printf("[ERR ] error rebuilding net %s: %s", netid, err)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
}
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
for deviceid := range instance.configHostPCIs {
|
for deviceid := range instance.configHostPCIs {
|
||||||
|
wg.Go(func() error {
|
||||||
err = instance.RebuildDevice(host, deviceid)
|
err = instance.RebuildDevice(host, deviceid)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
err_ch <- err
|
log.Printf("[ERR ] error rebuilding pci %s: %s", deviceid, err)
|
||||||
}
|
}
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
err = wg.Wait()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if instance.Type == VM {
|
||||||
|
err = instance.RebuildBoot(host)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[ERR ] error rebuilding boot: %s", err)
|
||||||
}
|
}
|
||||||
|
return err
|
||||||
if instance.Type == VM {
|
} else {
|
||||||
err = instance.RebuildBoot(host)
|
return nil
|
||||||
if err != nil {
|
}
|
||||||
err_ch <- err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
err_ch <- nil
|
|
||||||
}()
|
|
||||||
|
|
||||||
return <-err_ch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (instance *Instance) RebuildVolume(host *Node, volid string) error {
|
func (instance *Instance) RebuildVolume(host *Node, volid string) error {
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ go 1.26.2
|
|||||||
require (
|
require (
|
||||||
github.com/gin-gonic/gin v1.12.0
|
github.com/gin-gonic/gin v1.12.0
|
||||||
github.com/luthermonson/go-proxmox v0.7.0
|
github.com/luthermonson/go-proxmox v0.7.0
|
||||||
|
golang.org/x/sync v0.20.0
|
||||||
proxmoxaas-common-lib v0.0.0
|
proxmoxaas-common-lib v0.0.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user