metrics.go 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653
  1. // Copyright 2014 The Cockroach Authors.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  12. // implied. See the License for the specific language governing
  13. // permissions and limitations under the License. See the AUTHORS file
  14. // for names of contributors.
  15. //
  16. // Author: Tyler Neely (t@jujit.su)
  17. // IMPORTANT: only subscribe to the metric stream
  18. // using buffered channels that are regularly
  19. // flushed, as reaper will NOT block while trying
  20. // to send metrics to a subscriber, and will ignore
  21. // a subscriber if they fail to clear their channel
  22. // 3 times in a row!
  23. package loghisto
  24. import (
  25. "errors"
  26. "fmt"
  27. "math"
  28. "runtime"
  29. "sort"
  30. "sync"
  31. "sync/atomic"
  32. "time"
  33. "github.com/golang/glog"
  34. )
const (
	// precision affects the bucketing used during histogram value compression.
	// compress multiplies the natural log of a value by precision before
	// truncating to an integer bucket, and decompress divides by it again, so
	// a larger precision yields finer (and more numerous) buckets.
	precision = 100
)
// ProcessedMetricSet contains human-readable metrics that may also be
// suitable for storage in time-series databases.
type ProcessedMetricSet struct {
	// Time is the timestamp of the RawMetricSet this set was derived from.
	Time time.Time
	// Metrics maps a metric name to its value for the interval. It holds
	// counters, "<name>_rate" entries, the statistics derived from
	// histograms, and gauges.
	Metrics map[string]float64
}
// RawMetricSet contains metrics in a form that supports generation of
// percentiles and other rich statistics.
type RawMetricSet struct {
	// Time is the collection time rounded down to a multiple of the
	// MetricSystem's interval.
	Time time.Time
	// Counters holds the running total of every counter seen so far.
	Counters map[string]uint64
	// Rates holds the amount each counter increased during this interval.
	Rates map[string]uint64
	// Histograms maps a histogram name to its buckets: compressed value
	// (see compress) -> number of occurrences in this interval.
	Histograms map[string]map[int16]*uint64
	// Gauges holds the value returned by each registered gauge function at
	// collection time.
	Gauges map[string]float64
}
// TimerToken facilitates concurrent timings of durations of the same label.
// It is returned by MetricSystem.StartTimer and consumed by Stop.
type TimerToken struct {
	// Name is the histogram name the measured duration is recorded under.
	Name string
	// Start is the instant the timer was started.
	Start time.Time
	// MetricSystem is the system that receives the measurement on Stop.
	MetricSystem *MetricSystem
}
// proportion is a compact value with a corresponding count of
// occurrences in this interval.
type proportion struct {
	// Value is the decompressed value of a histogram bucket.
	Value float64
	// Count is how many observations fell into that bucket.
	Count uint64
}
// proportionArray is a sortable collection of proportion types. It
// implements sort.Interface, ordering elements by ascending Value.
type proportionArray []proportion
// MetricSystem facilitates the collection and distribution of metrics.
// Submitters record values with Counter, Histogram and gauge functions; once
// started, the reaper goroutine harvests them every interval and broadcasts
// the result to subscribers.
type MetricSystem struct {
	// percentiles is a mapping from labels to desired percentiles to be
	// calculated by the MetricSystem. Each label is used as a fmt format
	// string that receives the histogram name.
	percentiles map[string]float64
	// interval is the duration between collections and broadcasts of metrics
	// to subscribers.
	interval time.Duration
	// subscribeToRawMetrics allows subscription to a RawMetricSet generated
	// by reaper at the end of each interval on a sent channel.
	subscribeToRawMetrics chan chan *RawMetricSet
	// unsubscribeFromRawMetrics allows subscribers to unsubscribe from
	// receiving a RawMetricSet on the sent channel.
	unsubscribeFromRawMetrics chan chan *RawMetricSet
	// subscribeToProcessedMetrics allows subscription to a ProcessedMetricSet
	// generated by reaper at the end of each interval on a sent channel.
	subscribeToProcessedMetrics chan chan *ProcessedMetricSet
	// unsubscribeFromProcessedMetrics allows subscribers to unsubscribe from
	// receiving a ProcessedMetricSet on the sent channel.
	unsubscribeFromProcessedMetrics chan chan *ProcessedMetricSet
	// rawSubscribers stores current subscribers to RawMetrics
	rawSubscribers map[chan *RawMetricSet]struct{}
	// rawBadSubscribers tracks misbehaving subscribers who do not clear their
	// subscription channels regularly. The value counts consecutive
	// intervals in which a send to the subscriber had to be dropped.
	rawBadSubscribers map[chan *RawMetricSet]int
	// processedSubscribers stores current subscribers to ProcessedMetrics
	processedSubscribers map[chan *ProcessedMetricSet]struct{}
	// processedBadSubscribers tracks misbehaving subscribers who do not clear
	// their subscription channels regularly.
	processedBadSubscribers map[chan *ProcessedMetricSet]int
	// subscribersMu controls access to subscription structures
	subscribersMu sync.RWMutex
	// counterStore maintains the total counts of counters.
	counterStore map[string]*uint64
	// counterStoreMu controls access to counterStore.
	counterStoreMu sync.RWMutex
	// counterCache aggregates new Counters until they are collected by reaper().
	counterCache map[string]*uint64
	// counterMu controls access to counterCache.
	counterMu sync.RWMutex
	// histogramCache aggregates Histograms until they are collected by reaper().
	histogramCache map[string]map[int16]*uint64
	// histogramMu controls access to histogramCache.
	histogramMu sync.RWMutex
	// histogramCountStore keeps track of aggregate counts and sums for aggregate
	// mean calculation. Entries are keyed "<name>_sum" and "<name>_count".
	histogramCountStore map[string]*uint64
	// histogramCountMu controls access to the histogramCountStore.
	histogramCountMu sync.RWMutex
	// gaugeFuncs maps metrics to functions used for calculating their value
	gaugeFuncs map[string]func() float64
	// gaugeFuncsMu controls access to the gaugeFuncs map.
	gaugeFuncsMu sync.Mutex
	// Has reaper() been started?
	reaping bool
	// Close this to bring down this MetricSystem
	shutdownChan chan struct{}
}
// Metrics is the default metric system, which collects and broadcasts metrics
// to subscribers once every 60 seconds. Also includes default system stats
// (the sys.* gauges installed by NewMetricSystem). It is constructed at
// package initialization; nothing is harvested until Start is called on it.
var Metrics = NewMetricSystem(60*time.Second, true)
  128. // NewMetricSystem returns a new metric system that collects and broadcasts
  129. // metrics after each interval.
  130. func NewMetricSystem(interval time.Duration, sysStats bool) *MetricSystem {
  131. ms := &MetricSystem{
  132. percentiles: map[string]float64{
  133. "%s_min": 0,
  134. "%s_50": .5,
  135. "%s_75": .75,
  136. "%s_90": .9,
  137. "%s_95": .95,
  138. "%s_99": .99,
  139. "%s_99.9": .999,
  140. "%s_99.99": .9999,
  141. "%s_max": 1,
  142. },
  143. interval: interval,
  144. subscribeToRawMetrics: make(chan chan *RawMetricSet, 64),
  145. unsubscribeFromRawMetrics: make(chan chan *RawMetricSet, 64),
  146. subscribeToProcessedMetrics: make(chan chan *ProcessedMetricSet, 64),
  147. unsubscribeFromProcessedMetrics: make(chan chan *ProcessedMetricSet, 64),
  148. rawSubscribers: make(map[chan *RawMetricSet]struct{}),
  149. rawBadSubscribers: make(map[chan *RawMetricSet]int),
  150. processedSubscribers: make(map[chan *ProcessedMetricSet]struct{}),
  151. processedBadSubscribers: make(map[chan *ProcessedMetricSet]int),
  152. counterStore: make(map[string]*uint64),
  153. counterCache: make(map[string]*uint64),
  154. histogramCache: make(map[string]map[int16]*uint64),
  155. histogramCountStore: make(map[string]*uint64),
  156. gaugeFuncs: make(map[string]func() float64),
  157. shutdownChan: make(chan struct{}),
  158. }
  159. if sysStats {
  160. ms.gaugeFuncsMu.Lock()
  161. ms.gaugeFuncs["sys.Alloc"] = func() float64 {
  162. memStats := new(runtime.MemStats)
  163. runtime.ReadMemStats(memStats)
  164. return float64(memStats.Alloc)
  165. }
  166. ms.gaugeFuncs["sys.NumGC"] = func() float64 {
  167. memStats := new(runtime.MemStats)
  168. runtime.ReadMemStats(memStats)
  169. return float64(memStats.NumGC)
  170. }
  171. ms.gaugeFuncs["sys.PauseTotalNs"] = func() float64 {
  172. memStats := new(runtime.MemStats)
  173. runtime.ReadMemStats(memStats)
  174. return float64(memStats.PauseTotalNs)
  175. }
  176. ms.gaugeFuncs["sys.NumGoroutine"] = func() float64 {
  177. return float64(runtime.NumGoroutine())
  178. }
  179. ms.gaugeFuncsMu.Unlock()
  180. }
  181. return ms
  182. }
  183. // SpecifyPercentiles allows users to override the default collected
  184. // and reported percentiles.
  185. func (ms *MetricSystem) SpecifyPercentiles(percentiles map[string]float64) {
  186. ms.percentiles = percentiles
  187. }
// SubscribeToRawMetrics registers a channel to receive RawMetricSets
// periodically generated by reaper at each interval.
//
// The request is queued and applied by the reaper at its next collection.
// metricStream should be buffered and drained regularly: the reaper never
// blocks on a subscriber, and closes the channel of one that repeatedly
// fails to accept a set.
func (ms *MetricSystem) SubscribeToRawMetrics(metricStream chan *RawMetricSet) {
	ms.subscribeToRawMetrics <- metricStream
}
// UnsubscribeFromRawMetrics requests that a previously subscribed channel
// stop receiving RawMetricSets. The request is queued and applied by the
// reaper at its next collection, so a set may still arrive before then.
func (ms *MetricSystem) UnsubscribeFromRawMetrics(
	metricStream chan *RawMetricSet) {
	ms.unsubscribeFromRawMetrics <- metricStream
}
// SubscribeToProcessedMetrics registers a channel to receive
// ProcessedMetricSets periodically generated by reaper at each interval.
//
// The request is queued and applied by the reaper at its next collection.
// metricStream should be buffered and drained regularly: sends never block,
// and the channel of a subscriber that repeatedly fails to accept a set is
// closed.
func (ms *MetricSystem) SubscribeToProcessedMetrics(
	metricStream chan *ProcessedMetricSet) {
	ms.subscribeToProcessedMetrics <- metricStream
}
// UnsubscribeFromProcessedMetrics requests that a previously subscribed
// channel stop receiving ProcessedMetricSets. The request is queued and
// applied by the reaper at its next collection, so a set may still arrive
// before then.
func (ms *MetricSystem) UnsubscribeFromProcessedMetrics(
	metricStream chan *ProcessedMetricSet) {
	ms.unsubscribeFromProcessedMetrics <- metricStream
}
  211. // StartTimer begins a timer and returns a token which is required for halting
  212. // the timer. This allows for concurrent timings under the same name.
  213. func (ms *MetricSystem) StartTimer(name string) TimerToken {
  214. return TimerToken{
  215. Name: name,
  216. Start: time.Now(),
  217. MetricSystem: ms,
  218. }
  219. }
  220. // Stop stops a timer given by StartTimer, submits a Histogram of its duration
  221. // in nanoseconds, and returns its duration in nanoseconds.
  222. func (tt *TimerToken) Stop() time.Duration {
  223. duration := time.Since(tt.Start)
  224. tt.MetricSystem.Histogram(tt.Name, float64(duration.Nanoseconds()))
  225. return duration
  226. }
  227. // Counter is used for recording a running count of the total occurrences of
  228. // a particular event. A rate is also exported for the amount that a counter
  229. // has increased during an interval of this MetricSystem.
  230. func (ms *MetricSystem) Counter(name string, amount uint64) {
  231. ms.counterMu.RLock()
  232. _, exists := ms.counterCache[name]
  233. // perform lock promotion when we need more control
  234. if exists {
  235. atomic.AddUint64(ms.counterCache[name], amount)
  236. ms.counterMu.RUnlock()
  237. } else {
  238. ms.counterMu.RUnlock()
  239. ms.counterMu.Lock()
  240. _, syncExists := ms.counterCache[name]
  241. if !syncExists {
  242. var z uint64
  243. ms.counterCache[name] = &z
  244. }
  245. atomic.AddUint64(ms.counterCache[name], amount)
  246. ms.counterMu.Unlock()
  247. }
  248. }
  249. // Histogram is used for generating rich metrics, such as percentiles, from
  250. // periodically occurring continuous values.
  251. func (ms *MetricSystem) Histogram(name string, value float64) {
  252. compressedValue := compress(value)
  253. ms.histogramMu.RLock()
  254. _, present := ms.histogramCache[name][compressedValue]
  255. if present {
  256. atomic.AddUint64(ms.histogramCache[name][compressedValue], 1)
  257. ms.histogramMu.RUnlock()
  258. } else {
  259. ms.histogramMu.RUnlock()
  260. ms.histogramMu.Lock()
  261. _, syncPresent := ms.histogramCache[name][compressedValue]
  262. if !syncPresent {
  263. var z uint64
  264. _, mapPresent := ms.histogramCache[name]
  265. if !mapPresent {
  266. ms.histogramCache[name] = make(map[int16]*uint64)
  267. }
  268. ms.histogramCache[name][compressedValue] = &z
  269. }
  270. atomic.AddUint64(ms.histogramCache[name][compressedValue], 1)
  271. ms.histogramMu.Unlock()
  272. }
  273. }
  274. // RegisterGaugeFunc registers a function to be called at each interval
  275. // whose return value will be used to populate the <name> metric.
  276. func (ms *MetricSystem) RegisterGaugeFunc(name string, f func() float64) {
  277. ms.gaugeFuncsMu.Lock()
  278. ms.gaugeFuncs[name] = f
  279. ms.gaugeFuncsMu.Unlock()
  280. }
  281. // DeregisterGaugeFunc deregisters a function for the <name> metric.
  282. func (ms *MetricSystem) DeregisterGaugeFunc(name string) {
  283. ms.gaugeFuncsMu.Lock()
  284. delete(ms.gaugeFuncs, name)
  285. ms.gaugeFuncsMu.Unlock()
  286. }
  287. // compress takes a float64 and lossily shrinks it to an int16 to facilitate
  288. // bucketing of histogram values, staying within 1% of the true value. This
  289. // fails for large values of 1e142 and above, and is inaccurate for values
  290. // closer to 0 than +/- 0.51 or +/- math.Inf.
  291. func compress(value float64) int16 {
  292. i := int16(precision*math.Log(1.0+math.Abs(value)) + 0.5)
  293. if value < 0 {
  294. return -1 * i
  295. }
  296. return i
  297. }
  298. // decompress takes a lossily shrunk int16 and returns a float64 within 1% of
  299. // the original float64 passed to compress.
  300. func decompress(compressedValue int16) float64 {
  301. f := math.Exp(math.Abs(float64(compressedValue))/precision) - 1.0
  302. if compressedValue < 0 {
  303. return -1.0 * f
  304. }
  305. return f
  306. }
  307. // processHistograms derives rich metrics from histograms, currently
  308. // percentiles, sum, count, and mean.
  309. func (ms *MetricSystem) processHistograms(name string,
  310. valuesToCounts map[int16]*uint64) map[string]float64 {
  311. output := make(map[string]float64)
  312. totalSum := float64(0)
  313. totalCount := uint64(0)
  314. proportions := make([]proportion, 0, len(valuesToCounts))
  315. for compressedValue, count := range valuesToCounts {
  316. value := decompress(compressedValue)
  317. totalSum += value * float64(*count)
  318. totalCount += *count
  319. proportions = append(proportions, proportion{Value: value, Count: *count})
  320. }
  321. sumName := fmt.Sprintf("%s_sum", name)
  322. countName := fmt.Sprintf("%s_count", name)
  323. avgName := fmt.Sprintf("%s_avg", name)
  324. // increment interval sum and count
  325. output[countName] = float64(totalCount)
  326. output[sumName] = totalSum
  327. output[avgName] = totalSum / float64(totalCount)
  328. // increment aggregate sum and count
  329. ms.histogramCountMu.RLock()
  330. _, present := ms.histogramCountStore[sumName]
  331. if !present {
  332. ms.histogramCountMu.RUnlock()
  333. ms.histogramCountMu.Lock()
  334. _, syncPresent := ms.histogramCountStore[sumName]
  335. if !syncPresent {
  336. var x uint64
  337. ms.histogramCountStore[sumName] = &x
  338. var z uint64
  339. ms.histogramCountStore[countName] = &z
  340. }
  341. ms.histogramCountMu.Unlock()
  342. ms.histogramCountMu.RLock()
  343. }
  344. atomic.AddUint64(ms.histogramCountStore[sumName], uint64(totalSum))
  345. atomic.AddUint64(ms.histogramCountStore[countName], totalCount)
  346. ms.histogramCountMu.RUnlock()
  347. for label, p := range ms.percentiles {
  348. value, err := percentile(totalCount, proportions, p)
  349. if err != nil {
  350. glog.Errorf("unable to calculate percentile: %s", err)
  351. } else {
  352. output[fmt.Sprintf(label, name)] = value
  353. }
  354. }
  355. return output
  356. }
// These next 3 methods are for the implementation of sort.Interface,
// ordering a proportionArray by ascending Value.

// Len returns the number of proportions in the collection.
func (s proportionArray) Len() int {
	return len(s)
}

// Less reports whether the proportion at index i has a smaller Value than
// the one at index j.
func (s proportionArray) Less(i, j int) bool {
	return s[i].Value < s[j].Value
}

// Swap exchanges the proportions at indices i and j.
func (s proportionArray) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}
  367. // percentile calculates a percentile represented as a float64 between 0 and 1
  368. // inclusive from a proportionArray. totalCount is the sum of all counts of
  369. // elements in the proportionArray.
  370. func percentile(totalCount uint64, proportions proportionArray,
  371. percentile float64) (float64, error) {
  372. //TODO(tyler) handle multiple percentiles at once for efficiency
  373. sort.Sort(proportions)
  374. sofar := uint64(0)
  375. for _, proportion := range proportions {
  376. sofar += proportion.Count
  377. if float64(sofar)/float64(totalCount) >= percentile {
  378. return proportion.Value, nil
  379. }
  380. }
  381. return 0, errors.New("Invalid percentile. Should be between 0 and 1.")
  382. }
  383. func (ms *MetricSystem) collectRawMetrics() *RawMetricSet {
  384. normalizedInterval := time.Unix(0, time.Now().UnixNano()/
  385. ms.interval.Nanoseconds()*
  386. ms.interval.Nanoseconds())
  387. ms.counterMu.Lock()
  388. freshCounters := ms.counterCache
  389. ms.counterCache = make(map[string]*uint64)
  390. ms.counterMu.Unlock()
  391. rates := make(map[string]uint64)
  392. for name, count := range freshCounters {
  393. rates[name] = *count
  394. }
  395. counters := make(map[string]uint64)
  396. ms.counterStoreMu.RLock()
  397. // update counters
  398. for name, count := range freshCounters {
  399. _, exists := ms.counterStore[name]
  400. // only take a write lock when it's a totally new counter
  401. if !exists {
  402. ms.counterStoreMu.RUnlock()
  403. ms.counterStoreMu.Lock()
  404. _, syncExists := ms.counterStore[name]
  405. if !syncExists {
  406. var z uint64
  407. ms.counterStore[name] = &z
  408. }
  409. ms.counterStoreMu.Unlock()
  410. ms.counterStoreMu.RLock()
  411. }
  412. atomic.AddUint64(ms.counterStore[name], *count)
  413. }
  414. // copy counters for export
  415. for name, count := range ms.counterStore {
  416. counters[name] = *count
  417. }
  418. ms.counterStoreMu.RUnlock()
  419. ms.histogramMu.Lock()
  420. histograms := ms.histogramCache
  421. ms.histogramCache = make(map[string]map[int16]*uint64)
  422. ms.histogramMu.Unlock()
  423. ms.gaugeFuncsMu.Lock()
  424. gauges := make(map[string]float64)
  425. for name, f := range ms.gaugeFuncs {
  426. gauges[name] = f()
  427. }
  428. ms.gaugeFuncsMu.Unlock()
  429. return &RawMetricSet{
  430. Time: normalizedInterval,
  431. Counters: counters,
  432. Rates: rates,
  433. Histograms: histograms,
  434. Gauges: gauges,
  435. }
  436. }
  437. // processMetrics (potentially slowly) creates human consumable metrics from a
  438. // RawMetricSet, deriving rich statistics from histograms such as percentiles.
  439. func (ms *MetricSystem) processMetrics(
  440. rawMetrics *RawMetricSet) *ProcessedMetricSet {
  441. metrics := make(map[string]float64)
  442. for name, count := range rawMetrics.Counters {
  443. metrics[name] = float64(count)
  444. }
  445. for name, count := range rawMetrics.Rates {
  446. metrics[fmt.Sprintf("%s_rate", name)] = float64(count)
  447. }
  448. for name, valuesToCounts := range rawMetrics.Histograms {
  449. for histoName, histoValue := range ms.processHistograms(name, valuesToCounts) {
  450. metrics[histoName] = histoValue
  451. }
  452. }
  453. for name, value := range rawMetrics.Gauges {
  454. metrics[name] = value
  455. }
  456. return &ProcessedMetricSet{Time: rawMetrics.Time, Metrics: metrics}
  457. }
  458. func (ms *MetricSystem) updateSubscribers() {
  459. ms.subscribersMu.Lock()
  460. defer ms.subscribersMu.Unlock()
  461. for {
  462. select {
  463. case subscriber := <-ms.subscribeToRawMetrics:
  464. ms.rawSubscribers[subscriber] = struct{}{}
  465. case unsubscriber := <-ms.unsubscribeFromRawMetrics:
  466. delete(ms.rawSubscribers, unsubscriber)
  467. case subscriber := <-ms.subscribeToProcessedMetrics:
  468. ms.processedSubscribers[subscriber] = struct{}{}
  469. case unsubscriber := <-ms.unsubscribeFromProcessedMetrics:
  470. delete(ms.processedSubscribers, unsubscriber)
  471. default: // no changes in subscribers
  472. return
  473. }
  474. }
  475. }
  476. // reaper wakes up every <interval> seconds,
  477. // collects and processes metrics, and pushes
  478. // them to the corresponding subscribing channels.
  479. func (ms *MetricSystem) reaper() {
  480. ms.reaping = true
  481. // create goroutine pool to handle multiple processing tasks at once
  482. processChan := make(chan func(), 16)
  483. for i := 0; i < int(math.Max(float64(runtime.NumCPU()/4), 4)); i++ {
  484. go func() {
  485. for {
  486. c, ok := <-processChan
  487. if !ok {
  488. return
  489. }
  490. c()
  491. }
  492. }()
  493. }
  494. // begin reaper main loop
  495. for {
  496. // sleep until the next interval, or die if shutdownChan is closed
  497. tts := ms.interval.Nanoseconds() -
  498. (time.Now().UnixNano() % ms.interval.Nanoseconds())
  499. select {
  500. case <-time.After(time.Duration(tts)):
  501. case <-ms.shutdownChan:
  502. ms.reaping = false
  503. close(processChan)
  504. return
  505. }
  506. rawMetrics := ms.collectRawMetrics()
  507. ms.updateSubscribers()
  508. // broadcast raw metrics
  509. for subscriber := range ms.rawSubscribers {
  510. // new subscribers get all counters, otherwise just the new diffs
  511. select {
  512. case subscriber <- rawMetrics:
  513. delete(ms.rawBadSubscribers, subscriber)
  514. default:
  515. ms.rawBadSubscribers[subscriber]++
  516. glog.Error("a raw subscriber has allowed their channel to fill up. ",
  517. "dropping their metrics on the floor rather than blocking.")
  518. if ms.rawBadSubscribers[subscriber] >= 2 {
  519. glog.Error("this raw subscriber has caused dropped metrics at ",
  520. "least 3 times in a row. closing the channel.")
  521. delete(ms.rawSubscribers, subscriber)
  522. close(subscriber)
  523. }
  524. }
  525. }
  526. // Perform the rest in another goroutine since processing is not
  527. // gauranteed to complete before the interval is up.
  528. sendProcessed := func() {
  529. // this is potentially expensive if there is a massive number of metrics
  530. processedMetrics := ms.processMetrics(rawMetrics)
  531. // add aggregate mean
  532. for name := range rawMetrics.Histograms {
  533. ms.histogramCountMu.RLock()
  534. aggCountPtr, countPresent :=
  535. ms.histogramCountStore[fmt.Sprintf("%s_count", name)]
  536. aggCount := atomic.LoadUint64(aggCountPtr)
  537. aggSumPtr, sumPresent :=
  538. ms.histogramCountStore[fmt.Sprintf("%s_sum", name)]
  539. aggSum := atomic.LoadUint64(aggSumPtr)
  540. ms.histogramCountMu.RUnlock()
  541. if countPresent && sumPresent && aggCount > 0 {
  542. processedMetrics.Metrics[fmt.Sprintf("%s_agg_avg", name)] =
  543. float64(aggSum / aggCount)
  544. processedMetrics.Metrics[fmt.Sprintf("%s_agg_count", name)] =
  545. float64(aggCount)
  546. processedMetrics.Metrics[fmt.Sprintf("%s_agg_sum", name)] =
  547. float64(aggSum)
  548. }
  549. }
  550. // broadcast processed metrics
  551. ms.subscribersMu.Lock()
  552. for subscriber := range ms.processedSubscribers {
  553. select {
  554. case subscriber <- processedMetrics:
  555. delete(ms.processedBadSubscribers, subscriber)
  556. default:
  557. ms.processedBadSubscribers[subscriber]++
  558. glog.Error("a subscriber has allowed their channel to fill up. ",
  559. "dropping their metrics on the floor rather than blocking.")
  560. if ms.processedBadSubscribers[subscriber] >= 2 {
  561. glog.Error("this subscriber has caused dropped metrics at ",
  562. "least 3 times in a row. closing the channel.")
  563. delete(ms.processedSubscribers, subscriber)
  564. close(subscriber)
  565. }
  566. }
  567. }
  568. ms.subscribersMu.Unlock()
  569. }
  570. select {
  571. case processChan <- sendProcessed:
  572. default:
  573. // processChan has filled up, this metric load is not sustainable
  574. glog.Errorf("processing of metrics is taking longer than this node can "+
  575. "handle. dropping this entire interval of %s metrics on the "+
  576. "floor rather than blocking the reaper.", rawMetrics.Time)
  577. }
  578. } // end main reaper loop
  579. }
  580. // Start spawns a goroutine for merging metrics into caches from
  581. // metric submitters, and a reaper goroutine that harvests metrics at the
  582. // default interval of every 60 seconds.
  583. func (ms *MetricSystem) Start() {
  584. if !ms.reaping {
  585. go ms.reaper()
  586. }
  587. }
  588. // Stop shuts down a MetricSystem
  589. func (ms *MetricSystem) Stop() {
  590. close(ms.shutdownChan)
  591. }