Apache log files

Uit De Vliegende Brigade
Naar navigatie springen Naar zoeken springen

To follow the Apache access log in real time:

sudo tail -f /var/log/apache2/access.log

To parse:

#!/bin/bash
#
# Parse Apache log files to identify bots
################################################################################
#
#
# Accessing the log files
########################################
#
# * Copy them to my home dir on the server using "sudo cp access* /home/jeroen"
#   or something
# * Change attributes
# * Use Nemo to copy them to my laptop - I don't want to experiment with this
#   kind of stuff on a server
#
#
# Fields
########################################
#
# * $1: Domain:port
# * $2: IP address
# * $3: "-"
# * $4: "-"
# * $5: Date & time
# * $6: Time zone
# * $7: Get, Post, Head
# * $8: URL/source
# * $9: Protocol (HTTP/1.1)
# * $10: HTTP status code (200, etc.)
# * $11: Number of bytes?
# * $12: "-"
# * $13-$30: User agent string, depending on the number of spaces in this
#   string
# 
#
# Some filtering
########################################
#
# awk '{print $1, $2, $NF}' access.log
# awk '{print $1, $2, $3, $4, $5}' access.log
# awk '{print $13, $14, $15, $16, $17, $18, $19, $20}' access.log
#
# awk -F'"' '{print $1, $NF}' access.log
#
# awk -F' ' '{print $1, $4, $12}' access.log | awk -F'"' '{print $1, $2}'
#
# awk -F' ' '{print $1, $2, $13, $14, $15, $16, $17, $18, $19, $20}' access.log | awk -F'"' '{print $1, $2}'
#
# awk '{print $1, $2, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25}' access.log
#
# awk -v OFS='\t' '{print $1, $2, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25}' access.log | awk -F'"' '{print $1, $2}'
#
# awk '{print $1, "\t", $2, "\t", $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30}' access.log > parsed-access.log
# 
# cat access.log access.log.* | awk '{print $1, "\t", $2, "\t", $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30}' > parsed-access.log
#
# cat access.log access.log.* | \
# 	awk '{print $1, "\t", $2, "\t", $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30}' \
# 	> parsed-access.log
#
#
# Store awk arguments in strings
########################################
#
# Each program joins the user-agent fields ($13 up to the last field, NF)
# with single spaces, so short user agents are not padded with empty fields
# and long ones are not cut off at $30.
#
# * awk_arg_01: domain:port, IP address and user agent, separated by tabs.
#   The columns are concatenated with "\t" directly; passing "\t" as a
#   comma-separated print argument would surround every tab with the output
#   field separator (a space by default).
# * awk_arg_02: user agent only.
# * awk_arg: the program selected for use; switch it to awk_arg_02 to get
#   the user-agent-only output.
#
awk_arg_01='{ua = $13; for (i = 14; i <= NF; i++) ua = ua " " $i; print $1 "\t" $2 "\t" ua}'
awk_arg_02='{ua = $13; for (i = 14; i <= NF; i++) ua = ua " " $i; print ua}'
awk_arg=$awk_arg_01


# Parse all log files at once
########################################
#
# This works, but takes too long while developing:
#
# cat access.log access.log.* | awk "$awk_arg" > parsed-access.log
#

# Filter irrelevant lines out
########################################
#
# * Original length parsed-access.log: 	73.709 lines
# * Filtering for "Googlebot": 			66.834 lines left
# * Filtering for "Googlebot|Rocket": 	54.445 lines left
#
# cat access.log | awk "$awk_arg" > parsed-access.log
# cat access.log | grep -vE "Googlebot|Rocket" | awk "$awk_arg" > parsed-access.log


# Store exclusion strings in strings
########################################
#
# All three are extended regular expressions meant for "grep -E".
#
# * s01: known bots and tools; a line matches when it contains any of the
#   alternatives. Dots are escaped so the version number matches literally.
# * s02: one specific mobile Chrome user agent.
#   NOTE(review): this pattern was previously marked as not working. The
#   likely cause is that the combined pattern was built from $0 (the script
#   name) instead of from s01 and s02 - TODO confirm against real logs.
# * s: s01 and s02 joined with "|", so that either one excludes a line.
#
s01='ahrefs|bingbot|Google|Rocket|WordPress/6\.3\.1'
s02='Chrome/116\.0\.0\.0 Mobile Safari/537\.36'
s="${s01}|${s02}"


# End result (for now)
########################################
#
# * Only load the current log (access.log in the working directory)
# * Remove lines that match the known-agent pattern in $s01
# * Reformat the remaining lines with the awk program in $awk_arg
# * Write the result to parsed-access.log (overwritten on every run)
# * Known issue: an awk program that prints a fixed range of fields pads
#   lines with short user agents with a lot of whitespace
#
# grep reads the log file directly; a separate cat process is not needed.
grep -vE "$s01" access.log | awk "$awk_arg" > parsed-access.log