#!/usr/bin/env python
"""This program generates a histogram of the input data.

This is intended to replace a complex pipeline like the following

    awk -F\\" '{print $2}' /var/log/httpd-access.log | \\
        sort | uniq -c | sort -nr | head -20

...with a single process, like so:

    histogram.py -F\\" -f 2 -n 20 /var/log/httpd-access.log

The code deliberately sticks to constructs that work on both Python 2 and
Python 3 (no has_key/iterkeys, no f-strings, no "except ... as").
"""

__author__ = 'Chuck Swiger '
__copyright__ = 'Copyright (c) 2005-2006 Charles Swiger'
__version__ = '$Id: histogram.py,v 1.6 2006/05/06 15:47:51 chuck Exp $'
__license__ = 'This software is licenced under the same terms as Python. (See the PSF license at www.python.org/license.html).'

# add program version info
versionNumber = '1.1'
versionInfo = '$Revision: 1.6 $'

import os
import sys
import getopt
import signal

# Program-wide settings and counters.  parseOptions() fills in the option
# values; main() and scan_file() maintain 'linedict', 'lc' and 'lr'.
config = {}


def usage(msg=None, exitCode=os.EX_USAGE):
    """Displays program usage message and help, then exits.

    msg      -- optional error text printed above the usage summary.
    exitCode -- process exit status passed to sys.exit().
    """
    if msg:
        sys.stderr.write('\nERROR: %s\n\n' % msg)

    # watch out on the quoting of backslashes below
    sys.stderr.write("""Usage: histogram.py [options] data_file [ data_file2... ]

Where [options] may include any of the following flags:

  -b N     --byte             Start after byte offset N. "-b 0" means the whole line.
  -B N     --byte-field       Start field after N bytes.
  -c N     --count            Only display lines which appeared at least N times.
  -d str   --display          Display output as "count" (default) or "fract".
  -f N     --field            Use field #N only, default ("-f 0") means whole line. Also,
     N1,N2   "%1s %2s"        one can provide a comma-seperated list, or use a string.
  -F FS    --field-separator  Use FS as the field seperator, rather than whitespace.
                              Note that $FS can also be set in the environment (see awk).
  -h       --help             Display this help message.
  -i file  --input            Read data from file specified, rather than stdin. (NOOP)
  -l       --lower            Lower-cases the input to match case-insensitive.
  -n N     --number           Generate N lines of output and then exit.
  -o file  --output           Write data to file specified, rather than stdout.
  -p PAT   --pattern          Consider only lines which match string PAT. (no regex yet)
  -P PAT   --pattern-field    Consider only lines if selected field matches PAT.
  -r       --reverse          Generate the output sorted from least to most common.
  -s N     --size             Limit the size of the key to N characters.
  -q str   --quote            Quote the input text using STR in the generated output.
  -v       --verbose          Enable verbosity and display frequency list.
  -V       --version          Display the version of histogram.py and exit.
  -x PAT   --exclude          Exclude lines which match string PAT. (no regex yet)

If no filename is specified the program will read stdin, and will write
to stdout unless -o is used. '-' can be used as an alias for stdin or stdout.

This program generates a histogram of the input data. This is intended
to replace a complex pipeline like the following:

    awk -F\\" '{print $2}' /var/log/httpd-access.log | \\
        sort | uniq -c | sort -nr | head -20

...with a single process, like so:

    histogram.py -F\\" -f 2 -n 20 /var/log/httpd-access.log

""")
    sys.exit(exitCode)


def version(exitCode=os.EX_OK):
    """Displays program version info, then exits with exitCode."""
    sys.stderr.write("histogram.py version %s %s\n" %
                     (versionNumber, versionInfo))
    sys.exit(exitCode)


# option parsing code

def countOptOccurrences(oL, shortName, longName):
    """Counts the number of times an option or long-alias appears.

    oL is the list of (option, value) pairs returned by getopt.getopt().
    """
    names = (shortName, longName)
    count = 0
    for opt in oL:
        if opt[0] in names:
            count += 1
    return count


def checkOpt(oD, shortName, longName, defaultValue=None, setValue=1):
    """Scans oD for options with name shortName or longName.

    If found, returns the value.  If not found, returns defaultValue.
    If found, but the value is '' (a flag without argument), returns
    setValue.  The short name takes precedence when both are present.
    """
    # "in" replaces dict.has_key(), which no longer exists in Python 3
    if shortName in oD:
        possibleValue = oD[shortName]
    elif longName in oD:
        possibleValue = oD[longName]
    else:
        return defaultValue

    if possibleValue == '':
        return setValue
    return possibleValue


def _intOpt(oD, shortName, longName, defaultValue):
    """Returns the named option as an int; a non-numeric value is reported
    through usage() instead of surfacing as a traceback."""
    raw = checkOpt(oD, shortName, longName, defaultValue)
    try:
        return int(raw)
    except (TypeError, ValueError):
        usage('option %s/%s requires an integer, not %r' %
              (shortName, longName, raw))


def parseOptions():
    """This parses the command-line options sent to the program with getopt.

    Results are stored in the module-level config dict.  Exits through
    usage()/version() for -h/-V or on malformed options, and with
    os.EX_CANTCREAT if the output file cannot be opened.
    """
    optDict = {}
    try:
        # 'i:', 'reverse' and 'quote=' are advertised by usage() and so must
        # be accepted here as well
        optionList, args = getopt.getopt(
            sys.argv[1:],
            'b:B:c:d:f:F:hi:ln:o:p:P:q:rs:vVx:',
            ['help', 'verbose', 'display=', 'version', 'input=', 'output=',
             'count=', 'field=', 'field-separator=', 'number=', 'lower',
             'pattern=', 'size=', 'byte=', 'byte-field=', 'pattern-field=',
             'exclude=', 'reverse', 'quote='])
    except getopt.GetoptError:
        # sys.exc_info() rather than "except ... as" keeps old Pythons happy
        usage(sys.exc_info()[1])

    for k, v in optionList:
        optDict[k] = v

    if len(args) == 0:
        config['input'] = ['-']
    else:
        config['input'] = args

    # asking for help or the version terminates the program without running it
    if checkOpt(optDict, '-h', '--help', 0):
        usage(None, exitCode=os.EX_OK)
    if checkOpt(optDict, '-V', '--version', 0):
        version()

    # normal options
    config['byte'] = _intOpt(optDict, '-b', '--byte', -1)
    config['bytefield'] = _intOpt(optDict, '-B', '--byte-field', 0)
    config['count'] = _intOpt(optDict, '-c', '--count', 0)
    config['display'] = checkOpt(optDict, '-d', '--display', 'count').lower()

    # field numbers are 1-based on the command line and 0-based internally,
    # so "-f 0" (whole line) becomes -1
    fieldspec = checkOpt(optDict, '-f', '--field', '0')
    if ',' in fieldspec:
        try:
            # a real list (not a lazy map object) because it is indexed later
            config['fieldlist'] = [int(x) - 1 for x in fieldspec.split(',')]
        except ValueError:
            usage('option -f/--field requires integers, not %r' % fieldspec)
    elif '%' in fieldspec:
        sys.stderr.write("FATAL: Format not supported yet.\n")
        sys.exit(os.EX_UNAVAILABLE)
    else:
        try:
            config['fieldlist'] = [int(fieldspec) - 1]
        except ValueError:
            usage('option -f/--field requires an integer, not %r' % fieldspec)

    # FS from awk
    fs_default = os.environ.get('FS')
    config['fs'] = checkOpt(optDict, '-F', '--field-separator', fs_default)
    config['lower'] = checkOpt(optDict, '-l', '--lower', False, True)
    config['number'] = _intOpt(optDict, '-n', '--number', 0)
    config['pattern'] = checkOpt(optDict, '-p', '--pattern', None)
    config['fieldpattern'] = checkOpt(optDict, '-P', '--pattern-field', None)
    config['quote'] = checkOpt(optDict, '-q', '--quote', None)
    # 'reverse' is handed to sort(reverse=...): True (the default) lists the
    # most common keys first, and -r flips it to False
    config['reverse'] = checkOpt(optDict, '-r', '--reverse', True, False)
    config['size'] = _intOpt(optDict, '-s', '--size', 0)
    config['verbose'] = checkOpt(
        optDict, '-v', '--verbose', None,
        setValue=countOptOccurrences(optionList, '-v', '--verbose'))
    config['exclude'] = checkOpt(optDict, '-x', '--exclude', None)

    # finally, let's try to open our output
    config['output'] = checkOpt(optDict, '-o', '--output', '-')
    if config['output'] == '-':
        config['output'] = sys.stdout
    else:
        try:
            config['output'] = open(config['output'], 'w')
        except IOError:
            sys.stderr.write(
                "FATAL: histogram.py couldn't write to the output!\n")
            sys.exit(os.EX_CANTCREAT)


def scan_file(filename):
    """Performs a histogram on the input file, using '-' to mean stdin.

    Key counts are accumulated into config['linedict']; config['lr'] grows
    by the number of lines read and config['lc'] by the number of lines
    that passed every filter.  Exits with os.EX_NOINPUT if the file cannot
    be opened.
    """
    if filename == '-':
        fd = sys.stdin
    else:
        # 'U' asks Python 2 for universal newlines; Python 3 text mode does
        # that by default and rejects the flag from 3.11 on
        if sys.version_info[0] < 3:
            mode = 'rU'
        else:
            mode = 'r'
        try:
            fd = open(filename, mode, 1)
        except IOError:
            sys.stderr.write("FATAL: histogram.py couldn't open: %s\n"
                             % filename)
            sys.exit(os.EX_NOINPUT)

    ld = config['linedict']
    # per-file counters start at zero; seeding them from the running totals
    # and adding them back afterwards double-counted every earlier file
    lc = 0
    lr = 0

    verbose = config['verbose']
    byte = config['byte']
    bytefield = config['bytefield']
    maxsize = config['size']
    fs = config['fs']
    fieldpattern = config['fieldpattern']
    lower = config['lower']
    field_no = config['fieldlist'][0]
    max_field = max(config['fieldlist'])
    extrafields = config['fieldlist'][1:]

    # text placed between the first field and each extra field; built by
    # concatenation so a '%' in the quote string cannot corrupt a format
    quote = config['quote'] or ''
    joiner = quote + ' ' + quote

    # figure out these tests outside of the main loop, so I can switch to regex
    # linetest(l) is true when the line must be SKIPPED: it lacks the
    # required pattern OR it contains the excluded one
    ipat = config['pattern']
    xpat = config['exclude']
    if ipat and xpat:
        linetest = lambda l: (ipat not in l) or (xpat in l)
    elif ipat:
        linetest = lambda l: ipat not in l
    elif xpat:
        linetest = lambda l: xpat in l
    else:
        linetest = lambda l: False

    try:
        for line in fd:
            lr += 1

            # take off the end-of-line, but only if there is one: the last
            # line of a file may lack it and must not lose a real character
            if line.endswith('\n'):
                line = line[:-1]

            # trim by the byte-offset, if set
            if len(line) > byte:
                if byte > 0:
                    line = line[byte:]
            else:
                if verbose:
                    sys.stderr.write(
                        "# line shorter than %d chars, ignoring: %s\n"
                        % (byte, line))
                continue

            # check whether the line contains pattern
            if linetest(line):
                continue

            # split whenever any field is needed, including the case where
            # the key is the whole line but extra fields were requested
            wordlist = None
            if field_no >= 0 or extrafields:
                wordlist = line.split(fs)
                if len(wordlist) <= max_field:
                    if verbose:
                        sys.stderr.write("# field %d not found: %s\n" %
                                         (field_no + 1, line))
                    continue
            if field_no >= 0:
                line = wordlist[field_no]

            if bytefield > 0:
                if len(line) > bytefield:
                    line = line[bytefield:]
                else:
                    if verbose:
                        sys.stderr.write(
                            "# field shorter than %d chars, ignoring: %s\n"
                            % (bytefield, line))
                    continue

            if fieldpattern and line.find(fieldpattern) == -1:
                continue

            lc += 1
            for f in extrafields:
                line += joiner + wordlist[f].strip()

            if maxsize and len(line) > maxsize:
                line = line[:maxsize]
            if lower:
                line = line.lower()

            ld[line] = ld.get(line, 0) + 1
    finally:
        # close the file immediately before going off to others or to spend
        # a lot of time sorting and outputting the results of the histogram
        fd.close()

    # once we've read all of the lines in the file, update the global counters
    config['lc'] += lc
    config['lr'] += lr


def _closeOutput(out):
    """Flushes the output stream, closing it unless it is sys.stdout."""
    out.flush()
    if out is not sys.stdout:
        out.close()


def _writeHistogram(ld, out):
    """Writes the sorted histogram of ld (key -> count) to out, honouring the
    display, quote, count, number and reverse settings in config."""
    descending = config['reverse']
    threshold = config['count']
    limit = config['number']
    quote = config['quote'] or ''
    fractional = config['display'][0] == 'f'

    # the distinct counts are only needed for the verbose listing and for
    # sizing the count column
    freqlist = list(set(ld.values()))
    freqlist.sort(reverse=descending)
    if config['verbose']:
        out.write("# frequency list: %s\n" % str(freqlist))

    total = float(config['lc'])
    if fractional:
        # we're displaying line counts as a fraction of the total
        numfmt = "%%0.%df" % 4
    else:
        # make sure to left-justify the count to avoid confusing the
        # field-count for subsequent data programs, yet still be justified
        # in a fashion which is human-readable
        fieldwidth = max(len(str(freqlist[0])), len(str(freqlist[-1])))
        numfmt = "%%-%dd" % fieldwidth

    # one stable sort by count; keys with equal counts keep the dict's own
    # iteration order, as the former per-count scan over the keys did
    rows = sorted(ld.items(), key=lambda item: item[1], reverse=descending)

    lines_output = 0
    for key, value in rows:
        if value < threshold:
            if descending:
                # everything that follows is rarer still
                break
            # ascending order: rarer keys come first, skip up to the threshold
            continue

        if fractional:
            number = numfmt % (value / total)
        else:
            number = numfmt % value
        out.write(number + ' ' + quote + key + quote + '\n')

        # see whether we have output as many lines as specified, if > 0
        lines_output += 1
        if limit > 0 and lines_output >= limit:
            break


def main():
    """This is the main entry point, parse the command-line options, then
    read the input, then generate the histogram.

    Exits with os.EX_OK when no input lines were read and os.EX_DATAERR when
    lines were read but none matched the filters.
    """
    parseOptions()
    config['linedict'] = {}
    ld = config['linedict']
    config['lc'] = 0
    config['lr'] = 0

    out = config['output']
    try:
        for filename in config['input']:
            scan_file(filename)

        # if we're running verbose, note the linecount, lines read, and # of
        # files
        if config['verbose']:
            nf = len(config['input'])
            if nf > 1:
                out.write("# linecount: %d / %d of %d files\n" %
                          (config['lc'], config['lr'], nf))
            else:
                out.write("# linecount: %d / %d\n" %
                          (config['lc'], config['lr']))

        # lets see whether we read any lines at all
        if config['lr'] == 0:
            if config['verbose']:
                sys.stderr.write(
                    "# NOTICE: histogram read zero lines of input\n")
            sys.exit(os.EX_OK)

        # if we read text but didn't match any lines, exit noting the
        # patterns used
        if config['lc'] == 0 and config['lr'] > 0:
            if config['verbose']:
                sys.stderr.write(
                    "# NOTICE: histogram found zero lines matching patterns:\n# %s %s\n"
                    % (config['pattern'], config['fieldpattern']))
            sys.exit(os.EX_DATAERR)

        _writeHistogram(ld, out)
    finally:
        # runs on normal completion and on sys.exit() alike
        _closeOutput(out)


if __name__ == '__main__':
    if 1:
        try:
            main()
        except KeyboardInterrupt:
            sys.stderr.write(" ...Control-C seen, quitting program.\n")
    else:
        # profiling harness; flip the "if 1" above to enable it
        import profile
        pr = profile.Profile(bias=2.0e-5)
        profile.run('main()', 'main.profile')