From 94c70c77534ae1f99fe64f501b6f9f8c7bdda619 Mon Sep 17 00:00:00 2001
From: coldWater <254244460@qq.com>
Date: Fri, 15 Mar 2024 18:59:11 +0800
Subject: [PATCH] Refactor graceful manager, fix misused WaitGroup (#29738)

Follow #29629

(cherry picked from commit d08f4360c96e130e0454b76ecef9405f2bd312a1)
---
 modules/graceful/manager.go         |  5 ++-
 modules/graceful/manager_common.go  |  5 +--
 modules/graceful/manager_unix.go    | 44 ++++++++++++------------
 modules/graceful/manager_windows.go | 52 +++++++++++++++--------------
 4 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/modules/graceful/manager.go b/modules/graceful/manager.go
index f3f412863a..3f1115066a 100644
--- a/modules/graceful/manager.go
+++ b/modules/graceful/manager.go
@@ -233,7 +233,10 @@ func (g *Manager) setStateTransition(old, new state) bool {
 // At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init,
 // so this function MUST be called if a server is not used.
 func (g *Manager) InformCleanup() {
-	g.createServerWaitGroup.Done()
+	g.createServerCond.L.Lock()
+	defer g.createServerCond.L.Unlock()
+	g.createdServer++
+	g.createServerCond.Signal()
 }
 
 // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
diff --git a/modules/graceful/manager_common.go b/modules/graceful/manager_common.go
index 27196e1531..f6dbcc748d 100644
--- a/modules/graceful/manager_common.go
+++ b/modules/graceful/manager_common.go
@@ -42,8 +42,9 @@ type Manager struct {
 	terminateCtxCancel     context.CancelFunc
 	managerCtxCancel       context.CancelFunc
 	runningServerWaitGroup sync.WaitGroup
-	createServerWaitGroup  sync.WaitGroup
 	terminateWaitGroup     sync.WaitGroup
+	createServerCond       sync.Cond
+	createdServer          int
 	shutdownRequested      chan struct{}
 
 	toRunAtShutdown  []func()
@@ -52,7 +53,7 @@ type Manager struct {
 
 func newGracefulManager(ctx context.Context) *Manager {
 	manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})}
-	manager.createServerWaitGroup.Add(numberOfServersToCreate)
+	manager.createServerCond.L = &sync.Mutex{}
 	manager.prepare(ctx)
 	manager.start()
 	return manager
diff --git a/modules/graceful/manager_unix.go b/modules/graceful/manager_unix.go
index edf5fc248f..f49c42650c 100644
--- a/modules/graceful/manager_unix.go
+++ b/modules/graceful/manager_unix.go
@@ -57,20 +57,27 @@ func (g *Manager) start() {
 	// Handle clean up of unused provided listeners	and delayed start-up
 	startupDone := make(chan struct{})
 	go func() {
-		defer close(startupDone)
-		// Wait till we're done getting all the listeners and then close the unused ones
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
+		defer func() {
+			close(startupDone)
+			// Close the unused listeners and ignore the error here; there's not much we can do with it — failures are logged in the CloseProvidedListeners function
+			_ = CloseProvidedListeners()
 		}()
-		// Ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
-		_ = CloseProvidedListeners()
-		g.notify(readyMsg)
+		// Wait for all servers to be created
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				g.notify(readyMsg)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
 	if setting.StartupTimeout > 0 {
 		go func() {
@@ -78,16 +85,7 @@ func (g *Manager) start() {
 			case <-startupDone:
 				return
 			case <-g.IsShutdown():
-				func() {
-					// When WaitGroup counter goes negative it will panic - we don't care about this so we can just ignore it.
-					defer func() {
-						_ = recover()
-					}()
-					// Ensure that the createServerWaitGroup stops waiting
-					for {
-						g.createServerWaitGroup.Done()
-					}
-				}()
+				g.createServerCond.Signal()
 				return
 			case <-time.After(setting.StartupTimeout):
 				log.Error("Startup took too long! Shutting down")
diff --git a/modules/graceful/manager_windows.go b/modules/graceful/manager_windows.go
index ecf30af3f3..d776e0e9f9 100644
--- a/modules/graceful/manager_windows.go
+++ b/modules/graceful/manager_windows.go
@@ -149,33 +149,35 @@ hammerLoop:
 func (g *Manager) awaitServer(limit time.Duration) bool {
 	c := make(chan struct{})
 	go func() {
-		defer close(c)
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
-		}()
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				close(c)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
+
+	var tc <-chan time.Time
 	if limit > 0 {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-time.After(limit):
-			return false // timed out
-		case <-g.IsShutdown():
-			return false
-		}
-	} else {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-g.IsShutdown():
-			return false
-		}
+		tc = time.After(limit)
+	}
+	select {
+	case <-c:
+		return true // completed normally
+	case <-tc:
+		return false // timed out
+	case <-g.IsShutdown():
+		g.createServerCond.Signal()
+		return false
 	}
 }