← Back to Skills

SysOps

ops

System administration, Linux operations, shell scripting, and infrastructure management best practices.

SysOps

Guidelines for system administration, shell scripting, and infrastructure management.

When to Activate

Shell Scripting

Script template

#!/usr/bin/env bash
set -euo pipefail
IFS=$'\n\t'

# Description: What this script does
# Usage: ./script.sh <arg1> <arg2>
# Environment: LOG_FILE (optional) - overrides the default log file path

# Assign first, then mark readonly: `readonly VAR="$(cmd)"` would hide a
# failing command substitution from `set -e`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
# Default to /var/log/<script name>.log unless the caller exported LOG_FILE.
LOG_FILE="${LOG_FILE:-/var/log/$(basename "$0" .sh).log}"
readonly LOG_FILE

#######################################
# Print a timestamped message and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: message words; joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in LOG_FILE
# Returns:   0 even when LOG_FILE cannot be written (file logging is best-effort)
#######################################
log() {
    # The script-wide IFS starts with a newline, so "$*" would otherwise put
    # every argument on its own line. Join with spaces inside this function.
    local IFS=' '
    local line
    line="[$(date +'%Y-%m-%d %H:%M:%S')] $*"
    printf '%s\n' "$line"
    # Intentionally ignore append failures (e.g. /var/log not writable for a
    # non-root user): under `set -e` + `pipefail` a failed log write would
    # otherwise abort the whole script.
    { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}

#######################################
# Log an error message on stderr (and in LOG_FILE) and terminate the script.
# Arguments: message words
# Returns:   never; exits with status 1
#######################################
error() {
    local IFS=' '
    log "ERROR: $*" >&2
    exit 1
}

#######################################
# EXIT handler: runs on every exit path, including `error` and `set -e` aborts.
#######################################
cleanup() {
    log "Cleaning up..."
    # Remove temp files, etc.
}
trap cleanup EXIT

#######################################
# Script entry point.
# Arguments: the script's command-line arguments
#######################################
main() {
    log "Starting script..."
    # Your logic here
    log "Script completed successfully"
}

main "$@"

Input validation

# Require arguments
if [[ $# -lt 2 ]]; then
    # A usage message is a diagnostic: print it on stderr so it never ends up
    # in output that a caller is capturing or piping.
    echo "Usage: $0 <source> <destination>" >&2
    exit 1
fi

# Validate file exists
# (-f is true only for an existing regular file; `error` is the helper from
# the script template, which logs the message and exits with status 1)
if [[ ! -f "$1" ]]; then
    error "Source file does not exist: $1"
fi

# Validate directory
# (-d is true only for an existing directory)
if [[ ! -d "$2" ]]; then
    error "Destination directory does not exist: $2"
fi

Safe variable handling

# GOOD - quote variables
# Double quotes keep each expansion a single argument, even when the value
# contains spaces or glob characters.
cp "$source" "$destination"
# ${temp_dir:?} aborts with an error when temp_dir is unset or empty, so the
# command can never collapse to "rm -rf /".
rm -rf "${temp_dir:?}/"

# BAD - unquoted (word splitting issues)
# Unquoted expansions are split on whitespace and glob-expanded before the
# command sees them.
cp $source $destination
# With temp_dir unset or empty this line becomes "rm -rf /".
rm -rf $temp_dir/  # Dangerous!

# Default values
# ${var:-fallback} expands to fallback when var is unset or empty.
name="${1:-default_value}"
config_file="${CONFIG_FILE:-/etc/app/config.yaml}"

Systemd Services

Service unit file

# /etc/systemd/system/myapp.service
[Unit]
Description=My Application
Documentation=https://docs.example.com
# Wants= only pulls network-online.target in; the matching After= is what
# actually delays startup until the network is reported up.
After=network.target network-online.target
Wants=network-online.target
# Start rate limiting is a [Unit] setting: in [Service] the key
# StartLimitIntervalSec= is unknown and ignored with a warning.
# Together with Restart=always below: stop restarting after 3 start attempts
# within 60 seconds.
StartLimitIntervalSec=60
StartLimitBurst=3

[Service]
# Type=simple: the process started by ExecStart= is the main service process.
Type=simple
User=appuser
Group=appgroup
WorkingDirectory=/opt/myapp

Environment=NODE_ENV=production
# Additional variables are read from this file (KEY=value lines).
EnvironmentFile=/etc/myapp/env

ExecStart=/usr/bin/node /opt/myapp/server.js
# NOTE(review): reload sends SIGHUP to the main process; this assumes
# server.js installs a SIGHUP handler - confirm, otherwise the signal
# terminates the process instead of reloading it.
ExecReload=/bin/kill -HUP $MAINPID

# Restart on every exit, waiting 5 seconds between attempts.
Restart=always
RestartSec=5

# Security hardening
# ProtectSystem=strict makes the file system read-only for the service;
# ReadWritePaths= re-opens the single directory the application may write to.
NoNewPrivileges=yes
ProtectSystem=strict
ProtectHome=yes
PrivateTmp=yes
ReadWritePaths=/var/lib/myapp

[Install]
WantedBy=multi-user.target

Common systemctl commands

# Manage services
sudo systemctl start myapp
sudo systemctl stop myapp
sudo systemctl restart myapp
# reload runs the unit's ExecReload= command instead of stopping and starting
# the service, so it only works for units that define one.
sudo systemctl reload myapp

# Enable/disable auto-start
# (enable/disable change what happens at boot; they do not start or stop the
# running service)
sudo systemctl enable myapp
sudo systemctl disable myapp

# Check status
systemctl status myapp
# -u selects the unit's journal entries, -f keeps following new ones.
journalctl -u myapp -f

Log Management

Logrotate configuration

# /etc/logrotate.d/myapp
# Rotation policy for every *.log file under /var/log/myapp.
/var/log/myapp/*.log {
    # Rotate once a day and keep 14 rotated files.
    daily
    rotate 14
    # Compress rotated files, but leave the most recent one uncompressed until
    # the next rotation.
    compress
    delaycompress
    # Do not report an error when no log file matches; skip empty files.
    missingok
    notifempty
    # Recreate the log file after rotation with this mode, owner and group.
    create 0640 appuser appgroup
    # Run postrotate once for the whole pattern rather than once per file.
    sharedscripts
    postrotate
        # Ask the service to reopen its log files; output is discarded and
        # "|| true" keeps a failed reload from failing the rotation.
        systemctl reload myapp > /dev/null 2>&1 || true
    endscript
}

Log analysis

# Find errors in logs
# (-E enables extended regular expressions, needed for the ERROR|FATAL alternation)
grep -E "ERROR|FATAL" /var/log/myapp/*.log

# Count requests per status code
# Assumes the status code is whitespace-separated field 9, as in nginx's
# default "combined" format - confirm against the configured log_format.
# Output is one "count status" line per code, most frequent first.
awk '{print $9}' /var/log/nginx/access.log | sort | uniq -c | sort -rn

# Find slow requests (>1s)
# NOTE(review): compares the LAST field of each line with 1.0, so this only
# works when log_format ends with a numeric duration such as $request_time;
# the default "combined" format does not - confirm.
awk '$NF > 1.0 {print}' /var/log/nginx/access.log

# Real-time monitoring
# --line-buffered makes grep flush every matching line immediately instead of
# buffering its output while it reads from the pipe.
tail -f /var/log/myapp/app.log | grep --line-buffered ERROR

Performance Monitoring

System metrics

# CPU usage
# top in batch mode (-b), a single iteration (-n 1), first 20 lines only.
top -b -n 1 | head -20
# Per-CPU statistics for all CPUs: 5 samples, 1 second apart.
mpstat -P ALL 1 5

# Memory usage
free -h
# 10 samples, 1 second apart.
vmstat 1 10

# Disk I/O
# Extended per-device statistics: 5 samples, 1 second apart.
iostat -x 1 5
# -o lists only processes that are currently doing I/O.
iotop -o

# Network
# TCP (-t) and UDP (-u) listening (-l) sockets, numeric addresses/ports (-n).
ss -tuln
# NOTE(review): nethogs and iftop are interactive tools and presumably need
# root privileges and a separate package install - confirm for the target distro.
nethogs
iftop

Process investigation

# Find process by port
lsof -i :3000
# Filter on the socket's source port instead of grepping for the text "3000",
# which also matches e.g. port 30001 or a PID that contains 3000.
ss -tlnp 'sport = :3000'

# Process details
# pgrep never matches itself (unlike "ps aux | grep myapp", which also lists
# the grep process); -d, joins all matching PIDs into the comma-separated
# list that ps -p expects.
ps -o user,pid,%cpu,%mem,etime,args -p "$(pgrep -d, myapp)"
# pstree takes a single PID: -o selects the oldest matching process, which is
# normally the parent of any workers.
pstree -p "$(pgrep -o myapp)"

# Open files by process
# lsof -p accepts a comma-separated PID list, so every matching process is covered.
lsof -p "$(pgrep -d, myapp)"

# System calls
# Attach to the oldest matching process; -f follows the children it forks,
# -e trace=network limits output to network-related system calls.
strace -p "$(pgrep -o myapp)" -f -e trace=network

Disk Management

Check disk space

# Human-readable disk usage
df -h

# Find large files
# "{} +" passes many files to each ls invocation instead of forking one ls
# per file as "{} \;" does.
find /var -type f -size +100M -exec ls -lh {} +

# Directory sizes
# Largest 20 entries directly under /var (sort -h understands the K/M/G suffixes).
du -sh /var/* | sort -rh | head -20

# Disk usage by filetype
# Feed the whole NUL-separated file list to ONE du process. With
# "-exec du -ch {} +" find may split a long list across several du runs, each
# printing its own "total", and "tail -1" would then show only the last batch.
find /var -type f -name "*.log" -print0 | du -ch --files0-from=- | tail -1

Safe cleanup

# Clear old logs (keep 7 days)
# -type f restricts deletion to regular files, so a directory or symlink whose
# name happens to end in ".log" is never removed. Only names ending in ".log"
# match; rotated files such as app.log.1 or app.log.2.gz are left alone.
find /var/log -type f -name "*.log" -mtime +7 -delete

# Clear old temp files
# NOTE(review): selects on last ACCESS time; on file systems mounted with
# noatime that timestamp is not updated - confirm before relying on it.
find /tmp -type f -atime +3 -delete

# Clear package cache (Debian/Ubuntu)
apt-get clean
apt-get autoremove

# Clear journal logs older than 7 days
journalctl --vacuum-time=7d

Cron Jobs

Crontab best practices

# /etc/cron.d/myapp
# Entry format used below:
#   minute hour day-of-month month day-of-week user command
# Every job in this file runs as appuser.

# Always set PATH
PATH=/usr/local/bin:/usr/bin:/bin

# Always set MAILTO
# (cron mails whatever a job prints to this address; the jobs below redirect
# their output, so they produce mail only if that redirection is removed)
MAILTO=ops@example.com

# Use descriptive comments
# Backup database daily at 2 AM
# stdout and stderr are appended to backup.log.
0 2 * * * appuser /opt/myapp/scripts/backup.sh >> /var/log/myapp/backup.log 2>&1

# Health check every 5 minutes
# All output is discarded, so a failure is visible only through whatever the
# script itself does (alerts, exit handling).
*/5 * * * * appuser /opt/myapp/scripts/healthcheck.sh > /dev/null 2>&1

# Cleanup weekly on Sunday at 3 AM
# (day-of-week 0 = Sunday)
0 3 * * 0 appuser /opt/myapp/scripts/cleanup.sh >> /var/log/myapp/cleanup.log 2>&1

Lock to prevent overlap

#!/usr/bin/env bash
# Use flock to prevent concurrent runs
# Open the lock file on file descriptor 200 for the lifetime of the script;
# the lock taken below is dropped automatically when the script exits and the
# descriptor is closed.
exec 200>/var/lock/myapp-backup.lock
# -n: fail immediately instead of waiting when another run holds the lock.
# The message is a diagnostic, so it goes to stderr.
flock -n 200 || { echo "Already running" >&2; exit 1; }

# Your backup logic here

SSH Configuration

Secure SSH config

# /etc/ssh/sshd_config
Port 22
# The legacy "Protocol 2" line is intentionally omitted: current OpenSSH
# releases speak only protocol 2 and report the keyword as deprecated.
# Key-based logins only: no root, no passwords, no empty passwords.
PermitRootLogin no
PasswordAuthentication no
PubkeyAuthentication yes
AuthorizedKeysFile .ssh/authorized_keys
PermitEmptyPasswords no
# KbdInteractiveAuthentication is the current name of the option formerly
# spelled ChallengeResponseAuthentication (now a deprecated alias).
KbdInteractiveAuthentication no
UsePAM yes
X11Forwarding no
PrintMotd no
# Let clients pass their locale settings through.
AcceptEnv LANG LC_*
Subsystem sftp /usr/lib/openssh/sftp-server
# Disconnect after 3 failed authentication attempts or 20 seconds without a
# successful login.
MaxAuthTries 3
LoginGraceTime 20
# USER@HOST patterns: "deploy" may log in from any host, "admin" only from
# addresses matching 192.168.1.*; every other user is refused.
AllowUsers deploy@* admin@192.168.1.*

SSH client config

# ~/.ssh/config
# IdentitiesOnly yes makes ssh offer only the IdentityFile listed for the
# host. Without it, every key loaded in the agent is tried first, which can
# use up a server's MaxAuthTries limit before the right key is offered.
Host production
    HostName prod.example.com
    User deploy
    IdentityFile ~/.ssh/prod_key
    IdentitiesOnly yes
    ForwardAgent no

Host staging
    HostName staging.example.com
    User deploy
    IdentityFile ~/.ssh/staging_key
    IdentitiesOnly yes

# Keep the catch-all block last: ssh uses the first value it finds for each
# option, so host-specific settings must come before these defaults.
Host *
    ServerAliveInterval 60
    ServerAliveCountMax 3
    AddKeysToAgent yes

Firewall

UFW (Ubuntu)

# Default policies
# Drop every inbound connection that no rule below allows; leave outbound
# traffic unrestricted.
ufw default deny incoming
ufw default allow outgoing

# Allow specific services
# The ssh rule is added BEFORE "ufw enable" so that turning the firewall on
# does not cut off the SSH session used to configure it.
ufw allow ssh
ufw allow 80/tcp
ufw allow 443/tcp

# Allow from specific IP
# Port 5432 is reachable only from the 192.168.1.0/24 subnet
# (presumably the PostgreSQL port - confirm for this host).
ufw allow from 192.168.1.0/24 to any port 5432

# Enable firewall
# NOTE(review): these commands are shown without sudo and presumably need
# root; "ufw enable" may also ask for confirmation when run interactively.
ufw enable
ufw status verbose