Extracts from the AWStats logs showing the bandwidth used by various visitors to this site:
| crawler.bloglines.com | 353.02 MB |
| MSIECrawler | 567.41 MB |
| Inktomi Slurp | 135.79 MB |
| Googlebot | 59.86 MB |
Apparently MSIECrawler is IE sucking down the entire contents of the site. A couple of people seem to have done this; what is the point? Is this site that interesting? Are the spam blogs (copies of legitimate blogs full of links to p0ker sites) using IE for their scraping technology? My attitude to the spammers turns from annoyance to pity.
Bloglines is getting a bit carried away, 353M just downloading RSS feeds.
Inktomi Slurp is still slurping and not returning any visitors from search results.
Googlebot drives 95% of my traffic so 59M is acceptable.
Here is my latest crack at Apache log file analysis in Python:
#
# Apache log file analysis.
#
# Written so that it runs unchanged on Python 2.6+ and Python 3: every
# print call passes a single pre-formatted string, which prints the same
# text under both the print statement and the print function.
#
import re
import datetime

#
# Regular expression for parsing apache log file.
#
# NOTE(review): the two skipped fields look like the remote logname and
# remote user of Apache's "combined" log format -- confirm against the
# server's LogFormat directive.
#
oLogRE = re.compile(r'''([\d.]+).*\s+              # host
                        [^\s]+\s+                  # ? (skipped)
                        [^\s]+\s+                  # ? (skipped)
                        \[(.*?)\]\s+               # when
                        "(.*?)\s+(.*)\s+(.*)"\s+   # method, path, protocol
                        (\d+)\s+                   # Error code
                        ([^\s]+)\s+                # Size ?
                        "(.*?)"\s+                 # Referrer
                        "(.*?)"                    # Agent
                        ''', re.VERBOSE)

#
# Indices into the tuple of groups captured by oLogRE.
#
LOG_Who = 0
LOG_When = 1
LOG_How = 2
LOG_What = 3
LOG_Protocol = 4
LOG_Error = 5
LOG_Size = 6
LOG_Referrer = 7
LOG_Agent = 8

#
# Leading "day/month/year" part of the LOG_When field.  Compiled once
# here instead of on every call to FilterByDate.
#
_oDateRE = re.compile(r'(\d+)/(\w+)/(\d+).*')

#
# Month abbreviation -> month number.  Looked up by hand rather than via
# strptime('%b') so the result does not depend on the process locale.
#
_oMonths = dict((strName, nIndex + 1) for nIndex, strName in enumerate(
    ['Jan', 'Feb', 'Mar',
     'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep',
     'Oct', 'Nov', 'Dec']))


def ScanFile(strFile):
    """
    Scan apache log file and yield hits.

    strFile is the path of the log file to read.  Each hit is the tuple
    of groups captured by oLogRE, indexable with the LOG_* constants.
    Lines that do not match oLogRE are printed with a 'Reject: ' prefix
    and skipped.

    The file is read line by line and closed once the generator is
    exhausted.
    """
    # Open the file the caller asked for.  The path used to be hard-coded
    # here, so every call re-read the same log regardless of strFile.
    with open(strFile) as oFile:
        for strLine in oFile:
            oMatch = oLogRE.match(strLine)
            if oMatch:
                yield oMatch.groups()
            else:
                print('Reject: %s' % strLine)


def GatherBy(oHits, nField):
    """
    Gather hits from list of hits into a dictionary keyed
    by unique values of a specific field.

    Returns a dict mapping each distinct oHit[nField] value to the list
    of hits carrying that value, in the order they were seen.
    """
    oDict = {}

    for oHit in oHits:
        oDict.setdefault(oHit[nField], []).append(oHit)

    return oDict


def FilterBy(oHits, nField, strFilter):
    """
    Filter hits from list of hits by unique values of a specific field.

    Yields every hit whose oHit[nField] contains a match for the regular
    expression strFilter (re.search semantics, so the pattern may match
    anywhere in the field).
    """
    oRE = re.compile(strFilter)

    for oHit in oHits:
        if oRE.search(oHit[nField]):
            yield oHit


def FilterByDate(oHits, oStartDate, oEndDate=None):
    """
    Filter hits >= Start Date and < End Date

    oStartDate and oEndDate are datetime.date objects.  When oEndDate is
    omitted it defaults to tomorrow, i.e. everything up to and including
    today passes.  Only the day/month/year part of LOG_When is used; the
    time of day and UTC offset are ignored.

    Hits whose timestamp does not start with day/month/year, or whose
    month abbreviation is not recognised, are skipped.
    """
    # Work the default out when the filter runs, not when the function is
    # defined, so a long-lived process does not keep a stale "tomorrow".
    if oEndDate is None:
        oEndDate = datetime.date.today() + datetime.timedelta(1)

    for oHit in oHits:
        oMatch = _oDateRE.match(oHit[LOG_When])
        if not oMatch:
            continue

        strDay, strMonth, strYear = oMatch.groups()

        nMonth = _oMonths.get(strMonth)
        if nMonth is None:
            continue

        oDate = datetime.date(int(strYear), nMonth, int(strDay))

        if oStartDate <= oDate < oEndDate:
            yield oHit


def AnalyseBy(oHits, nField, bJustSummary=False):
    """
    Print hits by unique values of a specific field
    and generate counts and bytes for each unique value.

    For each distinct value of oHit[nField], in sorted order, prints
    "value count bytes" unless bJustSummary is true.  Always finishes
    with a single summary line of grand totals.  A LOG_Size of '-'
    contributes no bytes.
    """
    oDict = GatherBy(oHits, nField)

    oKeys = sorted(oDict)

    nGrandTotalCounts = 0
    nGrandTotalBytes = 0

    for oKey in oKeys:
        nCount = len(oDict[oKey])

        nTotal = sum(int(oHit[LOG_Size])
                     for oHit in oDict[oKey]
                     if oHit[LOG_Size] != '-')

        if not bJustSummary:
            print('%s %s %s' % (oKey, nCount, nTotal))

        nGrandTotalCounts += nCount
        nGrandTotalBytes += nTotal

    print("Unique items: %d, Total Hits: %d, Total Bytes: %d" % (len(oKeys),
                                                                 nGrandTotalCounts,
                                                                 nGrandTotalBytes))


def Main():
    """
    Report on the current and the previous (rotated) access log.
    """
    # FilterByDate treats the end date as exclusive.
    oStartDate = datetime.date.today() - datetime.timedelta(8)  # week yesterday
    oEndDate = datetime.date.today() - datetime.timedelta(1)    # yesterday

    oAllHits = list(FilterByDate(ScanFile('c:\\Desktop\\access.log'),
                                 oStartDate, oEndDate))
    oAllHits.extend(FilterByDate(ScanFile('c:\\Desktop\\access.log.1'),
                                 oStartDate, oEndDate))

    print("User Agents")
    AnalyseBy(oAllHits, LOG_Agent, True)

    print("All Hosts (hence all usage)")
    AnalyseBy(oAllHits, LOG_Who, True)

    print("Hits from bloglines")
    #
    # Determine different bloglines feeds and analyse each one
    #
    for strFeed, oFeedHits in GatherBy(FilterBy(oAllHits,
                                                LOG_Agent,
                                                'Bloglines'),
                                       LOG_What).items():
        #
        # Now analyse by agent: agent includes number of subscribers
        # so we see subscribers per feed.
        print("Bloglines feed %s" % strFeed)
        AnalyseBy(oFeedHits, LOG_Agent)

    print("Hits from MSIECrawler by host")
    AnalyseBy(FilterBy(oAllHits, LOG_Agent, 'MSIECrawler'), LOG_Who)

    print("Hits from Inktomi/yahoo slurp")
    AnalyseBy(FilterBy(oAllHits, LOG_Agent, 'Slurp'), LOG_Agent)


if __name__ == '__main__':
    Main()
Example output for week ending yesterday. Notes:
- 659,628,604 bytes served in a week!
- Slurp took 45,786,926 bytes
- MSIECrawler user took 49,980,154 bytes
- I've got more bloglines subscribers than I thought.
- Just how many RSS feed URLs does Drupal provide?
User Agents
Unique items: 497, Total Hits: 76052, Total Bytes: 659628604
All Hosts (hence all usage)
Unique items: 3301, Total Hits: 76052, Total Bytes: 659628604
Hits from bloglines
Bloglines feed /blog/1/feed
Bloglines/3.0-rho (http://www.bloglines.com; 1 subscriber) 252 13730570
Bloglines/3.0-rho (http://www.bloglines.com; 3 subscribers) 256 13947306
Bloglines/3.0-rho (http://www.bloglines.com; 5 subscribers) 252 13730570
Bloglines/3.0-rho (http://www.bloglines.com; 7 subscribers) 256 13947306
Unique items: 4, Total Hits: 1016, Total Bytes: 55355752
Bloglines feed /blog/feed
Bloglines/3.0-rho (http://www.bloglines.com; 1 subscriber) 242 13189698
Unique items: 1, Total Hits: 242, Total Bytes: 13189698
Bloglines feed /atom/feed
Bloglines/3.0-rho (http://www.bloglines.com; 3 subscribers) 256 1449434
Unique items: 1, Total Hits: 256, Total Bytes: 1449434
Bloglines feed /tags/18/feed
Bloglines/3.0-rho (http://www.bloglines.com; 1 subscriber) 256 12427520
Unique items: 1, Total Hits: 256, Total Bytes: 12427520
Bloglines feed /blog/feed/1
Bloglines/3.0-rho (http://www.bloglines.com; 5 subscribers) 252 0
Unique items: 1, Total Hits: 252, Total Bytes: 0
Bloglines feed /taxonomy/term/5/0/feed
Bloglines/3.0-rho (http://www.bloglines.com; 1 subscriber) 242 3832796
Unique items: 1, Total Hits: 242, Total Bytes: 3832796
Bloglines feed /node/feed
Bloglines/3.0-rho (http://www.bloglines.com; 3 subscribers) 256 13959338
Unique items: 1, Total Hits: 256, Total Bytes: 13959338
Bloglines feed /rss.xml
Bloglines/3.0-rho (http://www.bloglines.com; 3 subscribers) 256 0
Bloglines/3.0-rho (http://www.bloglines.com; 7 subscribers) 256 0
Unique items: 2, Total Hits: 512, Total Bytes: 0
Bloglines feed /tags/3/feed
Bloglines/3.0-rho (http://www.bloglines.com; 1 subscriber) 250 12999000
Unique items: 1, Total Hits: 250, Total Bytes: 12999000
Hits from MSIECrawler by host
81.159.46.223 1784 49980154
Unique items: 1, Total Hits: 1784, Total Bytes: 49980154
Hits from Inktomi/yahoo slurp
Mozilla/5.0 (compatible; Yahoo! Slurp China;) 58 800642
Mozilla/5.0 (compatible; Yahoo! Slurp;) 2066 44986284
Unique items: 2, Total Hits: 2124, Total Bytes: 45786926

