Сбор метрик и хранение в InfluxDB

telegraf

Агент сервера с открытым исходным кодом, который поможет вам собирать метрики из ваших стеков, датчиков и систем.

Установка

Скачиваем дистрибутив и устанавливаем пакет (dpkg -i для Debian, dnf install для CentOS):

# Debian/Ubuntu: download the package, then install it with dpkg.
wget https://repos.influxdata.com/debian/pool/stable/t/telegraf/telegraf_1.19.1-1_amd64.deb
sudo dpkg -i telegraf_1.19.1-1_amd64.deb

# CentOS/RHEL: download the package, then install it from the local file.
wget https://repos.influxdata.com/centos/9/amd64/stable/telegraf-1.22.4-1.x86_64.rpm
sudo dnf install ./telegraf-1.22.4-1.x86_64.rpm
 
   # Либо есть в тех же репозиториях что и influxdb
# Register the InfluxData repository. The heredoc delimiter is quoted so that
# $basearch is written to the repo file literally instead of being expanded
# by the shell.
cat <<'EOF' | sudo tee /etc/yum.repos.d/influxdata.repo
[influxdata]
name = InfluxData Repository - Stable
baseurl = https://repos.influxdata.com/stable/$basearch/main
enabled = 1
gpgcheck = 1
gpgkey = https://repos.influxdata.com/influxdata-archive_compat.key
EOF

# Package installation needs root, just like writing the repo file above.
sudo dnf install telegraf

Настройка

Конфиг находится в /etc/telegraf/telegraf.conf

Командой можно генерировать конфигурацию:

# telegraf -sample-config > telegraf.conf

В этом случае будет создана конфигурация по умолчанию со всеми доступными плагинами (метриками).
Набор плагинов можно ограничить аргументами:

# telegraf -sample-config --input-filter <плагины сбора метрик через ":"> --output-filter <плагины передачи данных с метрик через ":"> > telegraf.conf

На странице https://docs.influxdata.com/telegraf/v1.19/plugins/ можно найти список всех доступных плагинов.

Пример:

# telegraf -sample-config --input-filter disk:diskio:hddtemp --output-filter influxdb > telegraf.conf

Проверка конфигурации:

# telegraf --test && systemctl restart telegraf

Далее проверяем наличие данных в БД:

influx
> show databases
> use telegraf
> show measurements
> SELECT * FROM diskio ORDER BY time DESC LIMIT 15
# и т.д.

Пример конфига

Проверка конфигурации: telegraf --test

# No global tags are defined; the empty table is kept as a placeholder.
[global_tags]

[agent]
  # How often every input is collected.
  interval = "60s"
  round_interval = true
  metric_buffer_limit = 1000
  # The deprecated "flush_buffer_when_full" option was removed from this
  # example: Telegraf ignores it, so dropping it does not change behavior.
  collection_jitter = "0s"
  # How often collected metrics are sent to the outputs.
  flush_interval = "10s"
  flush_jitter = "0s"

  ## Logging configuration:
  debug = false
  quiet = false
  # Windows path; forward slashes avoid backslash escaping in a TOML basic string.
  logfile = "C:/Program Files/Telegraf/telegraf.log"

  # Left empty here; presumably Telegraf then falls back to the OS hostname - confirm in the agent docs.
  hostname = ""
 
 
###############################################################################
#                                  OUTPUTS                                    #
###############################################################################
 
[[outputs.influxdb]]
  # InfluxDB HTTP endpoint. The URL must name a host explicitly; replace
  # 127.0.0.1 with the address of the InfluxDB server when it is not local.
  urls = ["http://127.0.0.1:8086"] # required
  # Target database for all metrics written by this output.
  database = "telegraf" # required
  precision = "s"
  retention_policy = ""
  timeout = "10s"
  # Credentials are left empty in this example.
  username = ""
  password = ""
 
 
 
###############################################################################
#                                  INPUTS                                     #
###############################################################################
 
 
[[inputs.win_perf_counters]]
  [[inputs.win_perf_counters.object]]
    # CPU usage taken from the "Processor" performance object, reported per
    # core; an alternative to the native cpu input.
    Measurement = "win_cpu"
    ObjectName = "Processor"
    Instances = ["*"]
    # Also include the _Total instance when querying for all (*).
    IncludeTotal = true
    Counters = [
      "% Idle Time",
      "% Interrupt Time",
      "% Privileged Time",
      "% User Time",
      "% Processor Time",
      "% DPC Time",
    ]
 
  [[inputs.win_perf_counters.object]]
    # Logical disk busy/idle times, queue length and free space.
    Measurement = "win_disk"
    ObjectName = "LogicalDisk"
    Instances = ["*"]
    Counters = [
      "% Idle Time",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
      "Current Disk Queue Length",
      "% Free Space",
      "Free Megabytes",
    ]
    # Uncomment and set to true to include the _Total instance when querying for all (*).
    #IncludeTotal=false
 
  [[inputs.win_perf_counters.object]]
    # Physical disk throughput, operation rates, queue length and busy times.
    Measurement = "win_diskio"
    ObjectName = "PhysicalDisk"
    Instances = ["*"]
    Counters = [
      "Disk Read Bytes/sec",
      "Disk Write Bytes/sec",
      "Current Disk Queue Length",
      "Disk Reads/sec",
      "Disk Writes/sec",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
    ]
 
  [[inputs.win_perf_counters.object]]
    # Traffic, packet, discard and error counters for every network interface.
    Measurement = "win_net"
    ObjectName = "Network Interface"
    Instances = ["*"]
    Counters = [
      "Bytes Received/sec",
      "Bytes Sent/sec",
      "Packets Received/sec",
      "Packets Sent/sec",
      "Packets Received Discarded",
      "Packets Outbound Discarded",
      "Packets Received Errors",
      "Packets Outbound Errors",
    ]
 
  [[inputs.win_perf_counters.object]]
    # System-wide counters: context switches, system calls, run queue, uptime.
    Measurement = "win_system"
    ObjectName = "System"
    # Six dashes remove the instance part from the query.
    Instances = ["------"]
    Counters = [
      "Context Switches/sec",
      "System Calls/sec",
      "Processor Queue Length",
      "System Up Time",
    ]
    # Uncomment and set to true to include the _Total instance when querying for all (*).
    #IncludeTotal=false
 
  [[inputs.win_perf_counters.object]]
    # Memory counters. The Memory object only returns data when the instance
    # part of the query is removed.
    Measurement = "win_mem"
    ObjectName = "Memory"
    # Six dashes remove the instance part from the query.
    Instances = ["------"]
    Counters = [
      "Available Bytes",
      "Cache Faults/sec",
      "Demand Zero Faults/sec",
      "Page Faults/sec",
      "Pages/sec",
      "Transition Faults/sec",
      "Pool Nonpaged Bytes",
      "Pool Paged Bytes",
      "Standby Cache Reserve Bytes",
      "Standby Cache Normal Priority Bytes",
      "Standby Cache Core Bytes",
    ]
    # Uncomment and set to true to include the _Total instance when querying for all (*).
    #IncludeTotal=false
 
  [[inputs.win_perf_counters.object]]
    # Paging file usage; only the _Total instance is queried.
    Measurement = "win_swap"
    ObjectName = "Paging File"
    Instances = ["_Total"]
    Counters = [
      "% Usage",
    ]
 
  # Disabled: this object repeats the "Network Interface" counters that are
  # already collected into the "win_net" measurement earlier in this plugin,
  # and it sets no Measurement of its own, so the same data was being stored
  # a second time under the plugin's default measurement name.
  #[[inputs.win_perf_counters.object]]
  #  ObjectName = "Network Interface"
  #  Instances = ["*"]
  #  Counters = [
  #    "Bytes Sent/sec",
  #    "Bytes Received/sec",
  #    "Packets Sent/sec",
  #    "Packets Received/sec",
  #    "Packets Received Discarded",
  #    "Packets Received Errors",
  #    "Packets Outbound Discarded",
  #    "Packets Outbound Errors",
  #  ]
 
[[inputs.win_perf_counters.object]]
    # Per-process metrics, restricted to the "telegraf" process instance.
    ObjectName = "Process"
    # "wmiApSrv" was dropped from this list: it is a Windows service name,
    # not a counter of the Process object.
    Counters = [
      "% Processor Time",
      "Handle Count",
      "Private Bytes",
      "Thread Count",
      "Virtual Bytes",
      "Working Set",
    ]
    Instances = ["telegraf"]
    Measurement = "win_proc"
    # Uncomment and set to true to include the _Total instance when querying for all (*).
    #IncludeTotal=false
 
#[[inputs.win_services]]
#  ## Names of the services to monitor. Leave empty to monitor all the available services on the host
#  service_names = ["wmiApSrv"]	
 
#[[inputs.ping]]
#  urls = ["10.10.10.250"]
 
# # Read metrics about cpu usage
#[[inputs.cpu]]
#   ## Whether to report per-cpu stats or not
#   percpu = true
#   ## Whether to report total system cpu stats or not
#   totalcpu = true
#   ## Comment this line if you want the raw CPU time metrics
#   fielddrop = ["time_*"]
 
 
# Read metrics about disk usage by mount point.
# Every option below is commented out, so no plugin option is set explicitly.
[[inputs.disk]]
#   ## By default, telegraf gathers stats for all mountpoints.
#   ## Setting mountpoints will restrict the stats to the specified mountpoints.
#   ## mount_points=["/"]
#
#   ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
#   ## present on /run, /var/run, /dev/shm or /dev).
#   # ignore_fs = ["tmpfs", "devtmpfs"]
 
 
# Read metrics about disk IO by device.
# Every option below is commented out, so no plugin option is set explicitly.
[[inputs.diskio]]
#   ## By default, telegraf will gather stats for all devices including
#   ## disk partitions.
#   ## Setting devices will restrict the stats to the specified devices.
#   ## devices = ["sda", "sdb"]
#   ## Uncomment the following line if you do not need disk serial numbers.
#   ## skip_serial_number = true
 
 
# Read metrics about memory usage.
[[inputs.mem]]
#   # no configuration
 
 
# Read metrics about swap memory usage.
[[inputs.swap]]
#   # no configuration
 
 
 
# Collect metrics by running an external script.
[[inputs.exec]]
  # Script to execute.
  commands = ["C:/test.bat"]
  # Format in which the script's output is parsed.
  data_format = "influx"
  timeout = "30s"
# name_suffix = "Win_Run"
#  #name_override = "telegraf"

Мониторинг ZFS

Плагин для ZFS

Doc

Добавляем в конфиг; собственно, достаточно параметров по умолчанию.
Данные берутся из «/proc/spl/kstat/zfs».

# Enable the ZFS input; no options are set, so the plugin uses its defaults.
[[inputs.zfs]]

Публикация в Prometheus

Doc
По некоторым причинам метрики удобнее передавать (и настраивать в Grafana) через Prometheus.
Есть модуль, который публикует метрики Telegraf в формате Prometheus на веб-странице.
Добавляем блок «output» в конфиг:

# Publish the collected metrics over HTTP in Prometheus format.
[[outputs.prometheus_client]]
  metric_version = 2
  string_as_label = true
  # Listen address of the metrics endpoint (port 9273).
  listen = ":9273"

zxcx

Инструменты пользователя

Инструменты сайта

Содержание

Сбор метрик и хранение в InfluxDB

telegraf

Установка

Настройка

Мониторинг ZFS

Инструменты страницы