Apache log files
Jump to navigation
Jump to search
To see Apache log files in real time:
sudo tail -f /var/log/apache2/access.log
To parse:
#!/bin/bash
#
# Parse Apache log files to identify bots.
#
# Accessing the log files
########################################
#
# * Copy them to my home dir on the server using "sudo cp access* /home/jeroen"
#   or something
# * Change attributes
# * Use Nemo to copy them to my laptop - I don't want to experiment with this
#   kind of stuff on a server
#
# Fields (combined vhost log format)
########################################
#
# * $1:  Domain:port
# * $2:  IP address
# * $3:  "-"
# * $4:  "-"
# * $5:  Date & time
# * $6:  Time zone
# * $7:  Get, Post, Head
# * $8:  URL/source
# * $9:  Protocol (HTTP/1.1)
# * $10: HTML status code (200, etc.)
# * $11: Number of bytes?
# * $12: "-"
# * $13-$30: User agent string, depending on the number of spaces in this
#   string

# awk programs, stored in strings so they can be swapped easily while
# experimenting. 01 keeps domain, IP and user agent; 02 keeps only the agent.
awk_arg_01='{print $1, "\t", $2, "\t", $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30}'
awk_arg_02='{print $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30}'
awk_arg=$awk_arg_01

# Exclusion patterns (extended regular expressions, alternatives joined by |).
#
# Filtering effect, measured on one data set:
# * Original length parsed-access.log: 73.709 lines
# * Filtering for "GoogleBot": 66.834 lines left
# * Filtering for "GoogleBot|Rocket": 54.445 lines left
s01="ahrefs|bingbot|Google|Rocket|bingbot|WordPress/6.3.1"
s02="Chrome/116.0.0.0 Mobile Safari/537.36"
# BUG FIX: this used to be `s=$01$02`, which expands $0 (the script name)
# followed by the literal characters "1" and "2" — not $s01/$s02. Join the two
# patterns with the ERE alternation operator "|" instead.
s="${s01}|${s02}"

#######################################
# Filter known agents out of a log file and keep tab-separated columns.
# Globals:   s01 (read: exclusion pattern), awk_arg (read: awk program)
# Arguments: $1 - input log file, $2 - output file
# Outputs:   writes the filtered, reduced log to $2
# Returns:   non-zero if grep/awk fail or $2 is not writable
#######################################
parse_log() {
  local infile=$1
  local outfile=$2
  # No `cat` needed: grep reads the file directly. Remaining problem:
  # lines with a lot of whitespace shift the user-agent field positions.
  grep -vE "$s01" -- "$infile" | awk "$awk_arg" > "$outfile"
}

# End result (for now)
########################################
#
# * Only load the current log (access.log, not access.log.*) — parsing all
#   rotated logs at once works but takes too long while developing
# * Remove lines with known agents
# * Tab-separated columns
if [[ -f access.log ]]; then
  parse_log access.log parsed-access.log
fi