// hostpool.go

package hostpool

import (
	"log"
	"math"
	"math/rand"
	"sync"
	"time"
)
  9. // --- timer: this just exists for testing
  10. type timer interface {
  11. between(time.Time, time.Time) time.Duration
  12. }
  13. type realTimer struct{}
  14. // --- Response interfaces and structs ----
  15. type HostPoolResponse interface {
  16. Host() string
  17. Mark(error)
  18. hostPool() HostPool
  19. }
  20. type standardHostPoolResponse struct {
  21. host string
  22. sync.Once
  23. pool HostPool
  24. }
  25. type epsilonHostPoolResponse struct {
  26. standardHostPoolResponse
  27. started time.Time
  28. ended time.Time
  29. }
  30. // --- HostPool structs and interfaces ----
  31. type HostPool interface {
  32. Get() HostPoolResponse
  33. // keep the marks separate so we can override independently
  34. markSuccess(HostPoolResponse)
  35. markFailed(HostPoolResponse)
  36. ResetAll()
  37. Hosts() []string
  38. }
  39. type standardHostPool struct {
  40. sync.RWMutex
  41. hosts map[string]*hostEntry
  42. hostList []*hostEntry
  43. initialRetryDelay time.Duration
  44. maxRetryInterval time.Duration
  45. nextHostIndex int
  46. }
  47. type epsilonGreedyHostPool struct {
  48. standardHostPool // TODO - would be nifty if we could embed HostPool and Locker interfaces
  49. epsilon float32 // this is our exploration factor
  50. decayDuration time.Duration
  51. EpsilonValueCalculator // embed the epsilonValueCalculator
  52. timer
  53. }
  54. // --- hostEntry - this is due to get upgraded
  55. type hostEntry struct {
  56. host string
  57. nextRetry time.Time
  58. retryCount int16
  59. retryDelay time.Duration
  60. dead bool
  61. epsilonCounts []int64
  62. epsilonValues []int64
  63. epsilonIndex int
  64. epsilonValue float64
  65. epsilonPercentage float64
  66. }
  67. // --- Value Calculators -----------------
  68. type EpsilonValueCalculator interface {
  69. CalcValueFromAvgResponseTime(float64) float64
  70. }
  71. type LinearEpsilonValueCalculator struct{}
  72. type LogEpsilonValueCalculator struct{ LinearEpsilonValueCalculator }
  73. type PolynomialEpsilonValueCalculator struct {
  74. LinearEpsilonValueCalculator
  75. exp float64 // the exponent to which we will raise the value to reweight
  76. }
  77. // ------ constants -------------------
  78. const epsilonBuckets = 120
  79. const epsilonDecay = 0.90 // decay the exploration rate
  80. const minEpsilon = 0.01 // explore one percent of the time
  81. const initialEpsilon = 0.3
  82. const defaultDecayDuration = time.Duration(5) * time.Minute
  83. func New(hosts []string) *standardHostPool {
  84. p := &standardHostPool{
  85. hosts: make(map[string]*hostEntry, len(hosts)),
  86. hostList: make([]*hostEntry, len(hosts)),
  87. initialRetryDelay: time.Duration(30) * time.Second,
  88. maxRetryInterval: time.Duration(900) * time.Second,
  89. }
  90. for i, h := range hosts {
  91. e := &hostEntry{
  92. host: h,
  93. retryDelay: p.initialRetryDelay,
  94. }
  95. p.hosts[h] = e
  96. p.hostList[i] = e
  97. }
  98. return p
  99. }
  100. func (r *standardHostPoolResponse) Host() string {
  101. return r.host
  102. }
  103. func (r *standardHostPoolResponse) hostPool() HostPool {
  104. return r.pool
  105. }
  106. func (r *standardHostPoolResponse) Mark(err error) {
  107. r.Do(func() {
  108. doMark(err, r)
  109. })
  110. }
  111. func doMark(err error, r HostPoolResponse) {
  112. if err == nil {
  113. r.hostPool().markSuccess(r)
  114. } else {
  115. r.hostPool().markFailed(r)
  116. }
  117. }
  118. func (r *epsilonHostPoolResponse) Mark(err error) {
  119. r.Do(func() {
  120. r.ended = time.Now()
  121. doMark(err, r)
  122. })
  123. }
  124. // Epsilon Greedy is an algorithim that allows HostPool not only to track failure state,
  125. // but also to learn about "better" options in terms of speed, and to pick from available hosts
  126. // based on a percentage of how well they perform. This gives a weighted request rate to better
  127. // performing hosts, while still distributing requests to all hosts (proportionate to their performance)
  128. //
  129. // After enabling Epsilon Greedy, hosts must be marked for sucess along with a time value representing
  130. // how fast (or slow) that host was.
  131. //
  132. // host := pool.Get()
  133. // start := time.Now()
  134. // ..... do work with host
  135. // duration = time.Now().Sub(start)
  136. // pool.MarkSuccessWithTime(host, duration)
  137. //
  138. // a good overview of Epsilon Greedy is here http://stevehanov.ca/blog/index.php?id=132
  139. //
  140. // decayDuration may be set to 0 to use the default value of 5 minutes
  141. func NewEpsilonGreedy(hosts []string, decayDuration time.Duration, calc EpsilonValueCalculator) *epsilonGreedyHostPool {
  142. if decayDuration <= 0 {
  143. decayDuration = defaultDecayDuration
  144. }
  145. p := &epsilonGreedyHostPool{
  146. standardHostPool: *New(hosts),
  147. epsilon: float32(initialEpsilon),
  148. decayDuration: decayDuration,
  149. EpsilonValueCalculator: calc,
  150. timer: &realTimer{},
  151. }
  152. // allocate structures
  153. for _, h := range p.hostList {
  154. h.epsilonCounts = make([]int64, epsilonBuckets)
  155. h.epsilonValues = make([]int64, epsilonBuckets)
  156. }
  157. go p.epsilonGreedyDecay()
  158. return p
  159. }
  160. func (rt *realTimer) between(start time.Time, end time.Time) time.Duration {
  161. return end.Sub(start)
  162. }
  163. func (p *epsilonGreedyHostPool) SetEpsilon(newEpsilon float32) {
  164. p.Lock()
  165. defer p.Unlock()
  166. p.epsilon = newEpsilon
  167. }
  168. func (p *epsilonGreedyHostPool) epsilonGreedyDecay() {
  169. durationPerBucket := p.decayDuration / epsilonBuckets
  170. ticker := time.Tick(durationPerBucket)
  171. for {
  172. <-ticker
  173. p.performEpsilonGreedyDecay()
  174. }
  175. }
  176. func (p *epsilonGreedyHostPool) performEpsilonGreedyDecay() {
  177. p.Lock()
  178. for _, h := range p.hostList {
  179. h.epsilonIndex += 1
  180. h.epsilonIndex = h.epsilonIndex % epsilonBuckets
  181. h.epsilonCounts[h.epsilonIndex] = 0
  182. h.epsilonValues[h.epsilonIndex] = 0
  183. }
  184. p.Unlock()
  185. }
  186. // return an upstream entry from the HostPool
  187. func (p *standardHostPool) Get() HostPoolResponse {
  188. p.Lock()
  189. defer p.Unlock()
  190. host := p.getRoundRobin()
  191. return &standardHostPoolResponse{host: host, pool: p}
  192. }
  193. func (p *epsilonGreedyHostPool) Get() HostPoolResponse {
  194. p.Lock()
  195. defer p.Unlock()
  196. host := p.getEpsilonGreedy()
  197. started := time.Now()
  198. return &epsilonHostPoolResponse{
  199. standardHostPoolResponse: standardHostPoolResponse{host: host, pool: p},
  200. started: started,
  201. }
  202. }
  203. func (p *standardHostPool) getRoundRobin() string {
  204. now := time.Now()
  205. hostCount := len(p.hostList)
  206. for i := range p.hostList {
  207. // iterate via sequenece from where we last iterated
  208. currentIndex := (i + p.nextHostIndex) % hostCount
  209. h := p.hostList[currentIndex]
  210. if !h.dead {
  211. p.nextHostIndex = currentIndex + 1
  212. return h.host
  213. }
  214. if h.nextRetry.Before(now) {
  215. h.willRetryHost(p.maxRetryInterval)
  216. p.nextHostIndex = currentIndex + 1
  217. return h.host
  218. }
  219. }
  220. // all hosts are down. re-add them
  221. p.doResetAll()
  222. p.nextHostIndex = 0
  223. return p.hostList[0].host
  224. }
  225. func (p *epsilonGreedyHostPool) getEpsilonGreedy() string {
  226. var hostToUse *hostEntry
  227. // this is our exploration phase
  228. if rand.Float32() < p.epsilon {
  229. p.epsilon = p.epsilon * epsilonDecay
  230. if p.epsilon < minEpsilon {
  231. p.epsilon = minEpsilon
  232. }
  233. return p.getRoundRobin()
  234. }
  235. // calculate values for each host in the 0..1 range (but not ormalized)
  236. var possibleHosts []*hostEntry
  237. now := time.Now()
  238. var sumValues float64
  239. for _, h := range p.hostList {
  240. if h.canTryHost(now) {
  241. v := h.getWeightedAverageResponseTime()
  242. if v > 0 {
  243. ev := p.CalcValueFromAvgResponseTime(v)
  244. h.epsilonValue = ev
  245. sumValues += ev
  246. possibleHosts = append(possibleHosts, h)
  247. }
  248. }
  249. }
  250. if len(possibleHosts) != 0 {
  251. // now normalize to the 0..1 range to get a percentage
  252. for _, h := range possibleHosts {
  253. h.epsilonPercentage = h.epsilonValue / sumValues
  254. }
  255. // do a weighted random choice among hosts
  256. ceiling := 0.0
  257. pickPercentage := rand.Float64()
  258. for _, h := range possibleHosts {
  259. ceiling += h.epsilonPercentage
  260. if pickPercentage <= ceiling {
  261. hostToUse = h
  262. break
  263. }
  264. }
  265. }
  266. if hostToUse == nil {
  267. if len(possibleHosts) != 0 {
  268. log.Println("Failed to randomly choose a host, Dan loses")
  269. }
  270. return p.getRoundRobin()
  271. }
  272. if hostToUse.dead {
  273. hostToUse.willRetryHost(p.maxRetryInterval)
  274. }
  275. return hostToUse.host
  276. }
  277. func (h *hostEntry) canTryHost(now time.Time) bool {
  278. if !h.dead {
  279. return true
  280. }
  281. if h.nextRetry.Before(now) {
  282. return true
  283. }
  284. return false
  285. }
  286. func (h *hostEntry) willRetryHost(maxRetryInterval time.Duration) {
  287. h.retryCount += 1
  288. newDelay := h.retryDelay * 2
  289. if newDelay < maxRetryInterval {
  290. h.retryDelay = newDelay
  291. } else {
  292. h.retryDelay = maxRetryInterval
  293. }
  294. h.nextRetry = time.Now().Add(h.retryDelay)
  295. }
  296. func (h *hostEntry) getWeightedAverageResponseTime() float64 {
  297. var value float64
  298. var lastValue float64
  299. // start at 1 so we start with the oldest entry
  300. for i := 1; i <= epsilonBuckets; i += 1 {
  301. pos := (h.epsilonIndex + i) % epsilonBuckets
  302. bucketCount := h.epsilonCounts[pos]
  303. // Changing the line below to what I think it should be to get the weights right
  304. weight := float64(i) / float64(epsilonBuckets)
  305. if bucketCount > 0 {
  306. currentValue := float64(h.epsilonValues[pos]) / float64(bucketCount)
  307. value += currentValue * weight
  308. lastValue = currentValue
  309. } else {
  310. value += lastValue * weight
  311. }
  312. }
  313. return value
  314. }
  315. func (p *standardHostPool) ResetAll() {
  316. p.Lock()
  317. defer p.Unlock()
  318. p.doResetAll()
  319. }
  320. // this actually performs the logic to reset,
  321. // and should only be called when the lock has
  322. // already been acquired
  323. func (p *standardHostPool) doResetAll() {
  324. for _, h := range p.hosts {
  325. h.dead = false
  326. }
  327. }
  328. func (p *standardHostPool) markSuccess(hostR HostPoolResponse) {
  329. host := hostR.Host()
  330. p.Lock()
  331. defer p.Unlock()
  332. h, ok := p.hosts[host]
  333. if !ok {
  334. log.Fatalf("host %s not in HostPool %v", host, p.Hosts())
  335. }
  336. h.dead = false
  337. }
  338. func (p *epsilonGreedyHostPool) markSuccess(hostR HostPoolResponse) {
  339. // first do the base markSuccess - a little redundant with host lookup but cleaner than repeating logic
  340. p.standardHostPool.markSuccess(hostR)
  341. eHostR, ok := hostR.(*epsilonHostPoolResponse)
  342. if !ok {
  343. log.Printf("Incorrect type in eps markSuccess!") // TODO reflection to print out offending type
  344. return
  345. }
  346. host := eHostR.host
  347. duration := p.between(eHostR.started, eHostR.ended)
  348. p.Lock()
  349. defer p.Unlock()
  350. h, ok := p.hosts[host]
  351. if !ok {
  352. log.Fatalf("host %s not in HostPool %v", host, p.Hosts())
  353. }
  354. h.epsilonCounts[h.epsilonIndex]++
  355. h.epsilonValues[h.epsilonIndex] += int64(duration.Seconds() * 1000)
  356. }
  357. func (p *standardHostPool) markFailed(hostR HostPoolResponse) {
  358. host := hostR.Host()
  359. p.Lock()
  360. defer p.Unlock()
  361. h, ok := p.hosts[host]
  362. if !ok {
  363. log.Fatalf("host %s not in HostPool %v", host, p.Hosts())
  364. }
  365. if !h.dead {
  366. h.dead = true
  367. h.retryCount = 0
  368. h.retryDelay = p.initialRetryDelay
  369. h.nextRetry = time.Now().Add(h.retryDelay)
  370. }
  371. }
  372. func (p *standardHostPool) Hosts() []string {
  373. hosts := make([]string, len(p.hosts))
  374. for host, _ := range p.hosts {
  375. hosts = append(hosts, host)
  376. }
  377. return hosts
  378. }
  379. // -------- Epsilon Value Calculators ----------
  380. func (c *LinearEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 {
  381. return 1.0 / v
  382. }
  383. func (c *LogEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 {
  384. return math.Log(c.LinearEpsilonValueCalculator.CalcValueFromAvgResponseTime(v))
  385. }
  386. func (c *PolynomialEpsilonValueCalculator) CalcValueFromAvgResponseTime(v float64) float64 {
  387. return math.Pow(c.LinearEpsilonValueCalculator.CalcValueFromAvgResponseTime(v), c.exp)
  388. }