====== Сбор метрик и хранение в InfluxDB ====== ===== telegraf ===== Агент сервера с открытым исходным кодом, который поможет вам собирать метрики из ваших стеков, датчиков и систем.\\ ==== Установка ==== Скачиваем дистрибутив и ставим **dpkg -i** wget https://repos.influxdata.com/debian/pool/stable/t/telegraf/telegraf_1.19.1-1_amd64.deb # Debian wget https://repos.influxdata.com/centos/9/amd64/stable/telegraf-1.22.4-1.x86_64.rpm # CentOS # Либо есть в тех же репозиториях что и influxdb cat < ==== Настройка ==== Конфиг находится в **/etc/telegraf/telegraf.conf** Командой можно генерировать конфигурацию: # telegraf -sample-config > telegraf.conf В данном случае будет дефолтная конфигурация, со всеми доступными метриками (плагинами)\\ Можно ограничивать аргументами: # telegraf -sample-config --input-filter <плагины сбора метрик через ":"> --output-filter <плагины передачи данных с метрик через ":"> > telegraf.conf На странице https://docs.influxdata.com/telegraf/v1.19/plugins/ можно найти список всех доступных плагинов. Пример: # telegraf -sample-config --input-filter disk:diskio:hddtemp --output-filter influxdb > telegraf.conf Проверка конфигурации: # telegraf --test && systemctl restart telegraf Далее проверяем наличие данных в БД: influx > show databases > use telegraf > show measurements > SELECT * FROM diskio ORDER BY time DESC LIMIT 15 # и т.д.
:!: Пример конфига telegraf -test - проверка конфигурации
# Sample Telegraf configuration for a Windows host: collects Windows
# performance counters plus disk/diskio/mem/swap metrics and the output of a
# batch script, and writes everything to InfluxDB.

[global_tags]

[agent]
interval = "60s"
round_interval = true
metric_buffer_limit = 1000
# NOTE(review): believed to be deprecated and ignored by current Telegraf
# releases - confirm against the agent documentation before removing.
flush_buffer_when_full = true
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"

## Logging configuration:
debug = false
quiet = false
logfile = "C:/Program Files/Telegraf/telegraf.log"
# An empty hostname is left as-is from the original example.
hostname = ""

# ----------------------------------------------------------------------------
# OUTPUTS
# ----------------------------------------------------------------------------

[[outputs.influxdb]]
# The host part of the URL is empty in this example; put the address of the
# InfluxDB server in front of the port.
urls = ["http://:8086"]  # required
database = "telegraf"  # required
precision = "s"
retention_policy = ""
timeout = "10s"
username = ""
password = ""

# ----------------------------------------------------------------------------
# INPUTS
# ----------------------------------------------------------------------------

[[inputs.win_perf_counters]]

  [[inputs.win_perf_counters.object]]
  # Processor usage, alternative to native, reports on a per core.
  ObjectName = "Processor"
  Instances = ["*"]
  Counters = [
      "% Idle Time",
      "% Interrupt Time",
      "% Privileged Time",
      "% User Time",
      "% Processor Time",
      "% DPC Time",
  ]
  Measurement = "win_cpu"
  # Set to true to include _Total instance when querying for all (*).
  IncludeTotal = true

  [[inputs.win_perf_counters.object]]
  # Disk times and queues
  ObjectName = "LogicalDisk"
  Instances = ["*"]
  Counters = [
      "% Idle Time",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
      "Current Disk Queue Length",
      "% Free Space",
      "Free Megabytes",
  ]
  Measurement = "win_disk"
  # Set to true to include _Total instance when querying for all (*).
  #IncludeTotal=false

  [[inputs.win_perf_counters.object]]
  ObjectName = "PhysicalDisk"
  Instances = ["*"]
  Counters = [
      "Disk Read Bytes/sec",
      "Disk Write Bytes/sec",
      "Current Disk Queue Length",
      "Disk Reads/sec",
      "Disk Writes/sec",
      "% Disk Time",
      "% Disk Read Time",
      "% Disk Write Time",
  ]
  Measurement = "win_diskio"

  [[inputs.win_perf_counters.object]]
  ObjectName = "Network Interface"
  Instances = ["*"]
  Counters = [
      "Bytes Received/sec",
      "Bytes Sent/sec",
      "Packets Received/sec",
      "Packets Sent/sec",
      "Packets Received Discarded",
      "Packets Outbound Discarded",
      "Packets Received Errors",
      "Packets Outbound Errors",
  ]
  Measurement = "win_net"

  [[inputs.win_perf_counters.object]]
  ObjectName = "System"
  Counters = [
      "Context Switches/sec",
      "System Calls/sec",
      "Processor Queue Length",
      "System Up Time",
  ]
  Instances = ["------"]
  Measurement = "win_system"
  # Set to true to include _Total instance when querying for all (*).
  #IncludeTotal=false

  [[inputs.win_perf_counters.object]]
  # Example query where the Instance portion must be removed to get data back,
  # such as from the Memory object.
  ObjectName = "Memory"
  Counters = [
      "Available Bytes",
      "Cache Faults/sec",
      "Demand Zero Faults/sec",
      "Page Faults/sec",
      "Pages/sec",
      "Transition Faults/sec",
      "Pool Nonpaged Bytes",
      "Pool Paged Bytes",
      "Standby Cache Reserve Bytes",
      "Standby Cache Normal Priority Bytes",
      "Standby Cache Core Bytes",
  ]
  # Use 6 x - to remove the Instance bit from the query.
  Instances = ["------"]
  Measurement = "win_mem"
  # Set to true to include _Total instance when querying for all (*).
  #IncludeTotal=false

  [[inputs.win_perf_counters.object]]
  # Paging file usage, read from the _Total instance.
  ObjectName = "Paging File"
  Counters = [
      "% Usage",
  ]
  Instances = ["_Total"]
  Measurement = "win_swap"

  [[inputs.win_perf_counters.object]]
  # NOTE(review): this repeats the "Network Interface" object configured
  # above, but without a Measurement key, so its points are written under the
  # plugin's default measurement name instead of "win_net". Kept unchanged
  # from the original example; drop it if the duplicate is not wanted.
  ObjectName = "Network Interface"
  Instances = ["*"]
  Counters = [
      "Bytes Sent/sec",
      "Bytes Received/sec",
      "Packets Sent/sec",
      "Packets Received/sec",
      "Packets Received Discarded",
      "Packets Received Errors",
      "Packets Outbound Discarded",
      "Packets Outbound Errors",
  ]

  [[inputs.win_perf_counters.object]]
  # Process metrics, in this case for the telegraf process only.
  # ("wmiApSrv" was removed from Counters: it is a service name, see the
  # commented-out win_services block below, not a performance counter.)
  ObjectName = "Process"
  Counters = [
      "% Processor Time",
      "Handle Count",
      "Private Bytes",
      "Thread Count",
      "Virtual Bytes",
      "Working Set",
  ]
  Instances = ["telegraf"]
  Measurement = "win_proc"
  # Set to true to include _Total instance when querying for all (*).
  #IncludeTotal=false

#[[inputs.win_services]]
#  ## Names of the services to monitor. Leave empty to monitor all the available services on the host
#  service_names = ["wmiApSrv"]

#[[inputs.ping]]
#  urls = ["10.10.10.250"]

# Read metrics about cpu usage
#[[inputs.cpu]]
#  ## Whether to report per-cpu stats or not
#  percpu = true
#  ## Whether to report total system cpu stats or not
#  totalcpu = true
#  ## Comment this line if you want the raw CPU time metrics
#  fielddrop = ["time_*"]

# Read metrics about disk usage by mount point
[[inputs.disk]]
#  ## By default, telegraf gather stats for all mountpoints.
#  ## Setting mountpoints will restrict the stats to the specified mountpoints.
#  ## mount_points=["/"]
#
#  ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
#  ## present on /run, /var/run, /dev/shm or /dev).
#  # ignore_fs = ["tmpfs", "devtmpfs"]

# Read metrics about disk IO by device
[[inputs.diskio]]
#  ## By default, telegraf will gather stats for all devices including
#  ## disk partitions.
#  ## Setting devices will restrict the stats to the specified devices.
#  ## devices = ["sda", "sdb"]
#  ## Uncomment the following line if you do not need disk serial numbers.
#  ## skip_serial_number = true

# Read metrics about memory usage
[[inputs.mem]]
#  # no configuration

# Read metrics about swap memory usage
[[inputs.swap]]
#  # no configuration

# Run a batch script and parse its output as InfluxDB line protocol.
[[inputs.exec]]
commands = ["C:/test.bat"]
timeout = "30s"
# name_suffix = "Win_Run"
# #name_override = "telegraf"
data_format = "influx"
==== Мониторинг ZFS ====
:!: Плагин для ZFS [[https://github.com/influxdata/telegraf/blob/master/plugins/inputs/zfs/README.md|Doc]]\\ Добавляем в конфиг, собственно, достаточно параметров по умолчанию\\ Данные берутся из "/proc/spl/kstat/zfs"\\ [[inputs.zfs]]
:!: Публикация в Prometheus [[https://github.com/influxdata/telegraf/blob/master/plugins/outputs/prometheus_client/README.md|Doc]]\\ По некоторым причинам удобнее метрики передавать (настраивать в графане) через прометеус\\ Есть модуль, который публикует содержимое телеграфа в формате прометеуса на веб-странице\\ Добавляем блок "output" в конфиг:\\ [[outputs.prometheus_client]] listen = ":9273" string_as_label = true metric_version = 2