#!/usr/bin/env python3.3
'''
/*
 * Copyright (c) 2015, Charles McLouth
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 * 
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL STEWART LORD BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 auditconverter - This python script converts audit logs for ingestion into the
                  Helix Threat Detection analytics engine.
                  It converts all input files to be encoded as utf-8.
                  It converts all structured log formats to standard P4AUDIT format.
                  It splits output files in 2GB chunks
                  It optionally anonymizes user, client, host, and revisions
                  It optionally compresses output files
 
 see auditconverter -h for usage
 
 $Id: //guest/cmclouth/projects/auditconverter/src/auditconverter.py#32 $
 */
'''

import logging
import sys
import os
import argparse
import gzip
import sqlite3
import datetime
#import operator
import re

# Captures runs of decimal digits; fileCompareKey splits file names on it so
# that the numeric parts can be compared as integers.
numbers_re = re.compile(r'(\d+)')

# Version string reported by the -V/--version command line option.
scriptversion = "1.0"
# Base name of the running script, taken from the command line.
scriptname = os.path.basename(sys.argv[0])
# Output record templates. Both take the same seven values, in this order:
# (self.f_date, self.f_user, self.f_client, self.f_host, self.f_action, self.f_file, self.f_rev)
# standard format: '%s %s@%s %s %s %s#%s'
# structured format: '6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'
P4AUDIT_RECORDFORMAT='%s %s@%s %s %s %s#%s'
STRUCTURED_RECORDFORMAT='6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'

def isDEBUG(record):
    '''
    Return True when the log record's level name is exactly 'DEBUG'.
    '''
    levelName = record.levelname
    return 'DEBUG' == levelName

def isINFO(record):
    '''
    Return True when the log record's level name is exactly 'INFO'.
    '''
    levelName = record.levelname
    return 'INFO' == levelName

def isWARN(record):
    '''
    Return True when the log record's level name is exactly 'WARNING'.
    '''
    levelName = record.levelname
    return 'WARNING' == levelName

def isERROR(record):
    '''
    Return True when the log record's level name is 'ERROR' or 'CRITICAL'.
    '''
    levelName = record.levelname
    if levelName == 'ERROR':
        return True
    return levelName == 'CRITICAL'

class AuditException(Exception):
    '''
    Error raised while reading or parsing an audit log line.

    The positional arguments are (errCode, fileName, lineNo, ...); any
    further arguments carry the offending data and are not used when the
    exception is formatted. errCode indexes errorMessages.
    '''
    errorMessages = ['', # 0 : no error
                     'File:%s; Line:%d; Cannot determine codec.', # 1 : errCode, fileName, lineNo, lineBin
                     'File:%s; Line:%d; Ignoring empty line.', # 2 : errCode, fileName, lineNo
                     'File:%s; Line:%d; Is not a recognized structured audit log record format.', # 3 : errCode, fileName, lineNo, lineUTF8
                     'File:%s; Line:%d; user field is empty.', # 4 : errCode, fileName, lineNo, aRecord, lineBin
                     ]
    def __str__(self):
        '''
        Return the message for this error code, or '' for code 0 and for
        codes that have no entry in errorMessages.

        A missing or None fileName is shown as 'unknown' and a missing or
        None lineNo as -1.
        '''
        # Pad so an exception built with fewer than three arguments can still
        # be formatted instead of raising IndexError.
        padded = tuple(self.args[:3]) + (None,) * (3 - len(self.args))
        (errCode, fileName, lineNo) = padded
        if fileName is None: fileName = 'unknown'
        if lineNo is None: lineNo = -1
        messages = AuditException.errorMessages
        # Range is derived from the table so that a newly added message is
        # picked up without touching this method.
        if isinstance(errCode, int) and 1 <= errCode < len(messages):
            return messages[errCode] % (fileName, lineNo)
        return ''

class UTF8Converter():
    '''
    Decodes raw log lines by trial: codecs that have already succeeded are
    tried first (most frequently used first), then the remaining candidates
    in list order. Usage counts are kept per codec and the preferred order is
    rebuilt on every 1000th call.
    '''
    def __init__(self):
        # candidate codecs that have not yet been moved to the preferred list
        self.codecs = ['cp1252', 
            'cp1250', 'cp1251', 'cp1253', 'cp1254', 'cp1255', 'cp1256', 'cp1257', 'cp1258', 
            'latin_1', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', 
            'iso8859_7', 'iso8859_8', 'iso8859_9', 'iso8859_10', 
            'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_16', 
            'cp037', 'cp424', 'cp437', 'cp500', 'cp720', 
            'cp737', 'cp775', 'cp850', 'cp852', 'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 
            'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'cp869', 'cp874', 'cp875', 
            'cp932', 'cp949', 'cp950', 'cp1006', 'cp1026', 'cp1140', 'cp65001', 
            'shift_jis', 'shift_jis_2004', 'shift_jisx0213', 'euc_jp', 'euc_jis_2004', 'euc_jisx0213', 
            'euc_kr', 'ascii', 'big5', 'big5hkscs', 'gb2312', 
            'gbk', 'gb18030', 'hz', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004', 
            'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr', 'johab', 'koi8_r', 'koi8_u', 
            'mac_cyrillic', 'mac_greek', 'mac_iceland', 'mac_latin2', 'mac_roman', 'mac_turkish', 
            'ptcp154', 'utf_32', 'utf_32_be', 'utf_32_le', 'utf_16', 'utf_16_be', 'utf_16_le', 
            'utf_7', 'utf_8_sig']
        # codec name -> number of lines it has decoded
        self.codecMap = {'utf_8':0}
        # preferred codecs, tried before anything in self.codecs
        self.codecSorted = ['utf_8']
        # number of convert() calls so far
        self.counter = 0

    def convert(self, lineBin):
        '''
        Decode lineBin with the first codec that accepts it under strict
        error handling.

        Returns (codecName, text); both are None when no codec succeeds.
        '''
        self.counter += 1
        codecName = None
        lineEnc = None

        # pass 1: codecs that are already on the preferred list
        for candidate in self.codecSorted:
            try:
                lineEnc = lineBin.decode(encoding=candidate, errors='strict')
            except UnicodeError:
                continue
            codecName = candidate
            break

        # pass 2: every other candidate, skipping those tried in pass 1
        if codecName is None:
            for candidate in self.codecs:
                if candidate in self.codecSorted:
                    continue
                try:
                    lineEnc = lineBin.decode(encoding=candidate, errors='strict')
                except UnicodeError:
                    continue
                codecName = candidate
                break

        lineUTF8 = None
        if codecName is not None:
            if codecName == 'utf_8':
                lineUTF8 = lineEnc
            else:
                lineUTF8 = lineEnc.encode(encoding='utf_8').decode(encoding='utf_8')
            # record one more use of the winning codec
            self.codecMap[codecName] = self.codecMap.get(codecName, 0) + 1

        # every 1000th call: most-used codecs go first, and anything that has
        # a usage entry is dropped from the untried candidate list
        if self.counter % 1000 == 0 and len(self.codecMap) > 1:
            self.codecSorted = sorted(self.codecMap, key=self.codecMap.get, reverse=True)
            self.codecs[:] = [name for name in self.codecs if name not in self.codecMap]
        return (codecName, lineUTF8)

class Anonymizer():
    '''
    Replaces user, client, host and depot file values with generated
    placeholder names and remembers every mapping.

    Mappings are held in memory (forward and reverse dictionaries) and are
    written through to a SQLite database so that later runs reuse the same
    placeholders. All values that originate from log data are passed to
    SQLite as bound parameters, never interpolated into the statement text.
    '''
    def __init__(self, database):
        '''
        database: path of the SQLite file. A missing file is created and
        initialized; an existing one is migrated to the current schema
        version and its contents are loaded. When None, no connection is
        opened (hdbc and hstmt stay None), so only the lookup methods are
        usable.
        '''
        self.hdbc = None
        self.hstmt = None
        self.__dbVersion = 3
        # last number handed out per name class
        self.__unames = {'user':0, 'client':0, 'host':0, 'project':0}
        # real value -> placeholder
        self.__userMap = {}
        self.__clientMap = {}
        self.__hostMap = {}
        self.__projectMap = {}
        # placeholder -> real value
        self.__userRMap = {}
        self.__clientRMap = {}
        self.__hostRMap = {}
        self.__projectRMap = {}

        if database is None:
            return
        bInitDatabase = not os.path.isfile(database)
        self.hdbc = sqlite3.connect(database)
        # autocommit: every statement is committed as it is executed
        self.hdbc.isolation_level = None
        self.hstmt = self.hdbc.cursor()
        if bInitDatabase:
            # initialize blank database
            self.__initDatabase(self.hdbc)
            return

        self.__migrateDatabase()
        # load counters and maps from database
        logger.debug('load from db: counters')
        self.hstmt.execute('SELECT n, v FROM counters')
        for (n, v) in self.hstmt:
            self.__unames[n] = v
        for (table, forward, reverse) in self.__mapTables():
            logger.debug('load from db: %s' % table)
            self.hstmt.execute('SELECT n, v FROM %s' % table)
            for (n, v) in self.hstmt:
                forward[n] = v
                reverse[v] = n

    def __mapTables(self):
        # (table name, forward map, reverse map) for each mapping table
        return (('userMap', self.__userMap, self.__userRMap),
                ('clientMap', self.__clientMap, self.__clientRMap),
                ('hostMap', self.__hostMap, self.__hostRMap),
                ('projectMap', self.__projectMap, self.__projectRMap))

    def close(self):
        '''
        Close the cursor and the database connection, if they were opened.
        '''
        if self.hstmt is not None: self.hstmt.close()
        if self.hdbc is not None: self.hdbc.close()

    def __initDatabase(self, hdbc):
        # Create the schema of a blank database: the counters table (one row
        # per name class plus 'dbversion') and the four mapping tables.
        hdbc.execute('CREATE TABLE counters (n TEXT, v INTEGER)')
        hdbc.execute('CREATE UNIQUE INDEX counters_uix ON counters (n)')
        for name in ('user', 'client', 'host', 'project'):
            hdbc.execute('INSERT INTO counters (n,v) VALUES(?, ?)', (name, 0))
        hdbc.execute('INSERT INTO counters (n,v) VALUES(?, ?)', ('dbversion', self.__dbVersion))
        for table in ('userMap', 'clientMap', 'hostMap', 'projectMap'):
            hdbc.execute('CREATE TABLE %s (n TEXT, v TEXT)' % table)
            # Each index goes on its own table. projectMap_uix used to be
            # created on hostMap, which left projectMap without one.
            hdbc.execute('CREATE UNIQUE INDEX %s_uix ON %s (n)' % (table, table))

    def __putMapped(self, table, forward, reverse, k, v):
        # Record k -> v in both dictionaries and in the given table.
        forward[k] = v
        reverse[v] = k
        self.hdbc.execute('INSERT INTO %s (n, v) VALUES(?, ?)' % table, (k, v))

    def getMappedUser(self, k):
        '''Return the placeholder for user k, or None when unmapped.'''
        return self.__userMap.get(k)
    def getReverseMappedUser(self, k):
        '''Return the real user behind placeholder k, or None.'''
        return self.__userRMap.get(k)
    def __putMappedUser(self, k, v):
        self.__putMapped('userMap', self.__userMap, self.__userRMap, k, v)

    def getMappedClient(self, k):
        '''Return the placeholder for client k, or None when unmapped.'''
        return self.__clientMap.get(k)
    def getReverseMappedClient(self, k):
        '''Return the real client behind placeholder k, or None.'''
        return self.__clientRMap.get(k)
    def __putMappedClient(self, k, v):
        self.__putMapped('clientMap', self.__clientMap, self.__clientRMap, k, v)

    def getMappedHost(self, k):
        '''Return the placeholder for host k, or None when unmapped.'''
        return self.__hostMap.get(k)
    def getReverseMappedHost(self, k):
        '''Return the real host behind placeholder k, or None.'''
        return self.__hostRMap.get(k)
    def __putMappedHost(self, k, v):
        self.__putMapped('hostMap', self.__hostMap, self.__hostRMap, k, v)

    def getMappedProject(self, k):
        '''Return the placeholder for project path k, or None when unmapped.'''
        return self.__projectMap.get(k)
    def getReverseMappedProject(self, k):
        '''Return the real project path behind placeholder k, or None.'''
        return self.__projectRMap.get(k)
    def __putMappedProject(self, k, v):
        self.__putMapped('projectMap', self.__projectMap, self.__projectRMap, k, v)

    def __anonymize_user(self, data):
        # Existing placeholder, or a new 'User<n>' that is stored first.
        result = self.getMappedUser(data)
        if result is None:
            result = self.__genUniqueName('user', 'User')
            self.__putMappedUser(data, result)
        return result

    def __anonymize_client(self, data):
        # Existing placeholder, or a new 'Client<n>' that is stored first.
        result = self.getMappedClient(data)
        if result is None:
            result = self.__genUniqueName('client', 'Client')
            self.__putMappedClient(data, result)
        return result

    def __anonymize_host(self, data):
        # Existing placeholder, or a new '1.1.1.<n>' that is stored first.
        result = self.getMappedHost(data)
        if result is None:
            result = self.__genUniqueName('host', '1.1.1.')
            self.__putMappedHost(data, result)
        return result

    def __anonymize_depotfile(self, data):
        # The project is the depot path without its file name, cut down to
        # at most five '/'-separated parts (the two empty parts produced by
        # a leading '//' count towards the five).
        projectNameIn = '/'.join(data.split('/')[:-1][:5])
        result = self.getMappedProject(projectNameIn)
        if result is None:
            # The format for the Project name must look like a file and path
            # //Project%d/file.ext
            result = '//%s/file.ext' % self.__genUniqueName('project', 'Project')
            self.__putMappedProject(projectNameIn, result)
        return result

    def __genUniqueName(self, classType, prefix):
        # Advance and persist the counter of classType, then return
        # prefix + counter.
        self.__unames[classType] += 1
        self.hdbc.execute('UPDATE counters SET v=? WHERE n=?',
                          (self.__unames[classType], classType))
        return prefix + str(self.__unames[classType])

    def __rewriteMapValues(self, table, convert):
        # Replace every value v of the given mapping table with convert(v).
        rewritten = {}
        self.hstmt.execute('SELECT n, v FROM %s' % table)
        for r in self.hstmt.fetchall():
            logger.debug('fetching: %s' % str(r))
            rewritten[r[0]] = convert(r[1])
        self.hdbc.execute('DELETE FROM %s' % table)
        for (k, v) in rewritten.items():
            logger.debug('inserting: (%s,%s)' % (k,v))
            self.hdbc.execute('INSERT INTO %s (n, v) VALUES(?, ?)' % table, (k, v))

    def __migrateDatabase(self):
        # Upgrade an existing database one schema version at a time until it
        # reaches self.__dbVersion. A database without a 'dbversion' counter
        # is treated as version 0.
        self.hstmt.execute("SELECT n, v FROM counters WHERE n = 'dbversion'")
        r = self.hstmt.fetchone()
        dbVersion = 0
        if r is not None: dbVersion = r[1]
        while dbVersion < self.__dbVersion:
            logger.debug('Migrate database from version:%d to version: %d' % (dbVersion, dbVersion+1))
            if dbVersion == 0:
                # from 0 to 1 we add the dbversion to the counter table
                self.hdbc.execute('INSERT INTO counters (n,v) VALUES(?, ?)',
                                  ('dbversion', dbVersion+1))
            elif dbVersion == 1:
                # from 1 to 2 we change the convention of the host
                # from Host%d to 1.1.1.%d
                self.__rewriteMapValues('hostMap', lambda vOld: '1.1.1.' + vOld[4:])
                self.hdbc.execute("UPDATE counters SET v = ? WHERE n = 'dbversion'", (dbVersion+1,))
            elif dbVersion == 2:
                # from 2 to 3 we change the convention of the Project
                # from Project%d to //Project%d/file.ext
                self.__rewriteMapValues('projectMap', lambda vOld: '//%s/file.ext' % vOld)
                self.hdbc.execute("UPDATE counters SET v = ? WHERE n = 'dbversion'", (dbVersion+1,))
            dbVersion += 1

    def anonymizeRecord(self, aRecordIn):
        '''
        Return a new AuditLogLine that copies date, action and revision from
        aRecordIn and carries placeholders for user, client, host and file.
        Placeholders that do not exist yet are generated and stored.
        '''
        aRecordOut = AuditLogLine()
        aRecordOut.f_date = aRecordIn.f_date
        aRecordOut.f_action = aRecordIn.f_action
        aRecordOut.f_rev = aRecordIn.f_rev
        aRecordOut.f_user = self.__anonymize_user(aRecordIn.f_user)
        aRecordOut.f_client = self.__anonymize_client(aRecordIn.f_client)
        aRecordOut.f_host = self.__anonymize_host(aRecordIn.f_host)
        aRecordOut.f_file = self.__anonymize_depotfile(aRecordIn.f_file)
        return aRecordOut

    def dump(self, outputFile):
        '''
        Write all mappings stored in the database to outputFile (utf-8 text),
        one '### <kind> ID' section per table, rows ordered by real name.
        Empty or NULL names and values are written as '(none)'.
        '''
        sections = (('User', 'userMap'), ('Client', 'clientMap'),
                    ('Host', 'hostMap'), ('Project', 'projectMap'))
        # 'with' closes the report even when a query or write fails
        with open(outputFile, mode='wt', encoding='utf_8') as fWrite:
            fWrite.write('This file contains the mappings between the real user and project IDs in your\n'\
                'log files, and the anonymized identifiers used in the analytics.\n\n'\
                'This report is organized into the following lists:\n\n'\
                '* Mapping from real user name to anonymized user identifier\n'\
                '* Mapping from real client name to anonymized client identifier\n'\
                '* Mapping from real host/ip name to anonymized host/ip identifier\n'\
                '* Mapping from real project name to anonymized project identifier\n')
            for (label, table) in sections:
                fWrite.write('\n### %s ID = Anonymized Identifier\n\n' % label)
                self.hstmt.execute('SELECT n, v FROM %s ORDER BY n' % table)
                for (n, v) in self.hstmt.fetchall():
                    if n is None or len(n) < 1: n = '(none)'
                    if v is None or len(v) < 1: v = '(none)'
                    fWrite.write('   %s = %s\n' % (n, v))

    def load(self, inputFile):
        '''
        Replace all mappings with the contents of a file in the format
        written by dump().

        The four mapping tables and the in-memory maps are emptied first.
        Lines starting with '###' select the target table; lines starting
        with three spaces are 'name = placeholder' rows for the current
        table. Afterwards each counter is set to the number of entries in
        its map.
        '''
        sections = (('### User ID', self.__putMappedUser, 'User'),
                    ('### Client ID', self.__putMappedClient, 'Client'),
                    ('### Host ID', self.__putMappedHost, 'Host'),
                    ('### Project ID', self.__putMappedProject, 'Project'))
        # opened before anything is deleted, so a missing file changes nothing
        with open(inputFile, mode='rt', encoding='utf_8') as fRead:
            for (table, forward, reverse) in self.__mapTables():
                self.hdbc.execute('DELETE FROM %s' % table)
                # keep memory in step with the emptied table; otherwise stale
                # entries survive and inflate the counters computed below
                forward.clear()
                reverse.clear()
            mapFunction = None
            mapName = None
            rowCount = 0
            for (lineCount, line) in enumerate(fRead, 1):
                op = line[0:3]
                if op == '###':
                    if mapName is not None:
                        logger.info('loaded %d records into table %s' % (rowCount, mapName))
                    mapFunction = None
                    mapName = None
                    for (prefix, putFunction, name) in sections:
                        if line.startswith(prefix):
                            mapFunction = putFunction
                            mapName = name
                            break
                    rowCount = 0
                    if mapName is not None:
                        logger.info('Processing %s records' % mapName)
                elif op == '   ' and mapFunction is not None:
                    # Split at the LAST '=': generated placeholders never
                    # contain one, real names (e.g. depot paths) may.
                    parts = line.strip().rsplit('=', 1)
                    if len(parts) < 2:
                        logger.warning('File:%s; Line:%d; Ignoring mapping line without "=".' % (inputFile, lineCount))
                        continue
                    mapFunction(parts[0].strip(), parts[1].strip())
                    rowCount += 1
                    if rowCount % 100 == 0: logger.debug('%s:rowcount:%d' % (mapName, rowCount))
            if mapName is not None:
                logger.info('loaded %d records into table %s' % (rowCount, mapName))
            self.__unames = {'user':len(self.__userMap),
                             'client':len(self.__clientMap),
                             'host':len(self.__hostMap),
                             'project':len(self.__projectMap)}
            for (n, v) in self.__unames.items():
                self.hdbc.execute('UPDATE counters SET v=? WHERE n=?', (v, n))

    def debugMaps(self):
        '''
        Log the number of entries of each in-memory map at DEBUG level.
        '''
        logger.debug('len(self.__userMap):%d' % len(self.__userMap))
        logger.debug('len(self.__clientMap):%d' % len(self.__clientMap))
        logger.debug('len(self.__hostMap):%d' % len(self.__hostMap))
        logger.debug('len(self.__projectMap):%d' % len(self.__projectMap))

def fileCompareKey(value):
    '''
    Build a sort key that orders embedded numbers by value rather than
    character by character.

    value is split on runs of digits; the resulting list alternates text
    and digit runs, and every digit run (odd position) is converted to int.
    '''
    key = []
    for (position, token) in enumerate(numbers_re.split(value)):
        if position % 2 == 1:
            key.append(int(token))
        else:
            key.append(token)
    return key

def sortFiles(fileListIn):
    '''
    Return a new list holding the entries of fileListIn ordered by
    fileCompareKey; the input itself is not modified.
    '''
    ordered = list(fileListIn)
    ordered.sort(key=fileCompareKey)
    return ordered

def processInputParams(pargs=None):
    '''
    Parse and validate the command line.

    pargs: list of argument strings WITHOUT the program name. When None,
    sys.argv[1:] is used.

    Returns the argparse namespace. On any invalid combination the help text
    and an error are printed and the process exits (argparse behaviour):
      * at least one of -i / -a must be given
      * -i requires -o, and both must be existing directories
      * -a requires -d
    '''
    if pargs is None:
        # parse_args() expects the arguments only; passing sys.argv itself
        # makes argparse reject the program name as an unrecognized argument.
        pargs = sys.argv[1:]
    gParser = argparse.ArgumentParser()
    gParser.description="This python script converts audit logs for ingesting into the Helix Threat Detection analytics engine.\n"\
        "It converts all input files to be encoded as utf-8, converts all structured log formats to standard P4AUDIT format. "\
        "Optionally it will anonymize the data in the output log files and compress them."
    gParser.add_argument('-V', '--version', action='version', version='%(prog)s ' + scriptversion)
    gParser.add_argument('-i', '--input', dest='input', metavar='input', \
        help='a directory of audit logs to convert.')
    gParser.add_argument('-o', '--output', dest='output', metavar='output', \
        help='a directory to write converted log files to.')
    gParser.add_argument('-c', '--compress', dest='compress', action='store_true', \
        help='compress output log files with gzip compatible compression.')
    gParser.add_argument('-d', '--database', dest='database', metavar='database', \
        help='path to anonymization database file.')
    gParser.add_argument('-a', '--anonymization-map', dest='anonymizationmap', metavar='anonymization-map', \
        help='output file of anonymization-map.')
    gParser.add_argument('-C', '--case-sensitive', dest='casesensitive', action='store_true', default=False, \
        help='format as case-sensitive.')
    gParser.add_argument('-s', '--size', dest='maxSize', metavar='maxSize', \
        type=int, default=2048, \
        help='Maximum size of output files in MB (default:2048.) Specify zero (0) for no limit.')
    gParser.add_argument('-v', '--novalidate', dest='validate', action='store_false', default=True, \
        help='Do not validate the data of audit records.')
    gParser.add_argument('-f', '--format', dest='logformat', metavar='logformat', \
        type=int, default=0, \
        help='Output record format. 0 (zero) for P4AUDIT 1 (one) for Structured Audit Log.')
    args = gParser.parse_args(pargs)

    # argparse always sets every dest (None when the option is absent), so
    # testing for None is sufficient.
    # must have i or a or both
    if args.input is None and args.anonymizationmap is None:
        gParser.print_help()
        gParser.error('input (-i) or anonymization-map (-a) are required.')
    # if i then o
    if args.input is not None and args.output is None:
        gParser.print_help()
        gParser.error('output (-o) is required.')
    # if a then d
    if args.anonymizationmap is not None and args.database is None:
        gParser.print_help()
        gParser.error('database (-d) is required.')
    # validate inputs
    if args.input is not None and not os.path.isdir(args.input):
        gParser.print_help()
        gParser.error("invalid input directory (-i) '%s'" % (args.input))
    if args.output is not None and not os.path.isdir(args.output):
        gParser.print_help()
        gParser.error("invalid output directory (-o) '%s'" % (args.output))
    return args

class AuditFileIO(object):
    '''
    Abstraction for reading audit records from, or writing lines to, a file.

    A file whose name ends in '.gz' is handled through gzip. An uncompressed
    output file is split into numbered parts ('<name>.000', '<name>.001',
    ...) when maxWriteSize is a positive number of bytes; compressed output
    is never split.
    '''
    def __init__(self, fileName, forRead=True, caseSensitive=False, utf8converter=None, maxWriteSize=None):
        '''
        fileName: path of the file to open.
        forRead: True to read records, False to write lines.
        caseSensitive: when False, every line read is lower-cased.
        utf8converter: object whose convert(bytes) returns (codecName, text);
            when None, a UTF8Converter is created on the first read.
        maxWriteSize: size limit in bytes of one output part; None or 0
            disables splitting.
        '''
        self.fileName = fileName
        self.isCompressed = fileName.endswith('.gz')
        self.linesRead = 0
        self.linesIgnored = 0
        self.bytesRead = 0
        self.linesWritten = 0
        self.bytesWritten = 0
        self.bytesWrittenPart = 0
        self.forRead = forRead
        self.firstLineRead = False
        self.caseSensitive = caseSensitive
        self.utf8converter = utf8converter
        self.structuredFormat = False
        self.maxWriteSize = maxWriteSize
        self.parts = 0
        if self.forRead:
            # Plain 'rb': input is only read, so it must not require write
            # permission on the log file (the former 'r+b' did).
            if self.isCompressed: self.f = gzip.open(self.fileName, 'rb')
            else: self.f = open(self.fileName, mode='rb')
        else:
            self.f = self._openForWrite()
        self.splitfiles = self.maxWriteSize is not None and self.maxWriteSize > 0 and not self.isCompressed

    def _openForWrite(self):
        # Open (and truncate) self.fileName for binary output.
        if self.isCompressed:
            return gzip.open(self.fileName, 'wb')
        return open(self.fileName, 'w+b')

    def _partFileName(self):
        # Name of the current part: the part number goes at the end of the
        # name, or before the '.gz' suffix of a compressed file.
        if self.isCompressed:
            return "%s.%03d.%s" % (self.fileName[0:len(self.fileName)-3], self.parts, 'gz')
        return "%s.%03d" % (self.fileName, self.parts)

    def readRecord(self, bValidateRecord=True):
        '''
        Read the next line and return it as an AuditLogLine.

        Returns None at end of file or when the file was opened for writing.
        The first non-empty line decides whether the file is treated as a
        structured log (more than two comma separated fields).

        Raises AuditException carrying the file name and line number:
        code 1 when no codec decodes the line, code 2 for an empty line, and
        whatever setLine()/validateRecord() raise.
        '''
        if not self.forRead: return None
        lineBin = self.f.readline()
        if lineBin is None or len(lineBin) < 1:
            return None
        self.linesRead += 1
        self.bytesRead += len(lineBin)
        if self.utf8converter is None:
            # the constructor default would otherwise fail on the call below
            self.utf8converter = UTF8Converter()
        (codecName, lineUTF8) = self.utf8converter.convert(lineBin)
        if codecName is None:
            raise AuditException(1, self.fileName, self.linesRead, lineBin)
        if codecName != 'utf_8': logger.debug('file: %s - converted line %d from %s to utf_8' % (self.fileName, self.linesRead, codecName))
        lineUTF8 = lineUTF8.rstrip()
        if len(lineUTF8) < 1:
            self.linesIgnored += 1
            raise AuditException(2, self.fileName, self.linesRead)
        if not self.caseSensitive:
            # convert all data to lowercase
            lineUTF8 = lineUTF8.lower()
        if not self.firstLineRead:
            self.firstLineRead = True
            # test format
            self.structuredFormat = len(lineUTF8.split(',')) > 2
        aRecord = AuditLogLine()
        try:
            aRecord.setLine(lineUTF8, self.structuredFormat)
            if bValidateRecord: aRecord.validateRecord()
        except AuditException as e:
            # the record does not know where it came from: re-raise with the
            # file name, line number and the data belonging to the error code
            errCode = e.args[0]
            if errCode == 1:
                raise AuditException(errCode, self.fileName, self.linesRead, lineBin)
            elif errCode == 2:
                raise AuditException(errCode, self.fileName, self.linesRead)
            elif errCode == 3:
                raise AuditException(errCode, self.fileName, self.linesRead, lineUTF8)
            elif errCode == 4:
                raise AuditException(errCode, self.fileName, self.linesRead, aRecord, lineBin)
            else: raise
        return aRecord

    def writeLine(self, lineUTF8):
        '''
        Append lineUTF8 plus a newline, encoded as utf-8.

        When splitting is active and the line would push the current part
        over maxWriteSize, the part is closed, renamed to its numbered name
        and a new part is started first. Returns the number of bytes
        written, or 0 when the file was opened for reading.
        '''
        if self.forRead: return 0
        # add \n
        lineBin = (lineUTF8 + '\n').encode(encoding='utf_8', errors='strict')
        lineBytes = len(lineBin)
        # Does this exceed the max filesize? An empty part is never rolled
        # over: a single over-long line would only leave an empty chunk.
        if self.splitfiles and self.bytesWrittenPart > 0 \
                and lineBytes + self.bytesWrittenPart > self.maxWriteSize:
            self.f.close()
            os.replace(self.fileName, self._partFileName())
            self.parts += 1
            self.bytesWrittenPart = 0
            # open new file
            if os.path.isfile(self.fileName):
                logger.error("rollover file still exists.")
            self.f = self._openForWrite()
        bytesWritten = self.f.write(lineBin)
        if bytesWritten > 0:
            self.linesWritten += 1
            self.bytesWritten += bytesWritten
            self.bytesWrittenPart += bytesWritten
        return bytesWritten

    def close(self):
        '''
        Close the file. If the output was split at least once, the last part
        is renamed to its numbered name as well. All counters and settings
        are reset afterwards. Returns the result of the underlying close().
        '''
        fReturn = self.f.close()
        if self.splitfiles and self.parts > 0:
            os.replace(self.fileName, self._partFileName())
            self.parts += 1
            self.bytesWrittenPart = 0

        self.fileName = None
        self.isCompressed = False
        self.linesRead = 0
        self.linesIgnored = 0
        self.bytesRead = 0
        self.linesWritten = 0
        self.bytesWritten = 0
        self.bytesWrittenPart = 0
        self.forRead = None
        self.firstLineRead = None
        self.caseSensitive = None
        self.structuredFormat = False
        self.parts = 0
        self.splitfiles = False
        return fReturn

class AuditLogLine(object):
    '''
    One audit log record, parsed from and rendered to a single log line.

    standard format: '%s %s@%s %s %s %s#%s'
    structured format: '6,,,%s,,,%s,%s,,%s,,,,%s,%s,%s'
    Either format is filled from the tuple
    (self.f_date, self.f_user, self.f_client, self.f_host, self.f_action, self.f_file, self.f_rev)
    '''
    def __init__(self, rowformat='%s %s@%s %s %s %s#%s'):
        '''
        rowformat -- %-style format with seven %s slots, used by getLine()
                     and __str__() to render the record.
        '''
        self._clear()
        self.rowformat = rowformat

    def _clear(self):
        '''Reset every record field to the empty string.'''
        self.f_date = ''
        self.f_user = ''
        self.f_client = ''
        self.f_host = ''
        self.f_action = ''
        self.f_file = ''
        self.f_rev = ''

    def setLine(self, data, structuredLog=False):
        '''
        Parse one log line into the record fields.

        data          -- the log line as str
        structuredLog -- True if data is a comma separated structured record,
                         False if it uses the standard layout
                         f_date f_user@f_client f_host f_action f_file#f_rev

        Returns True on success.  All fields are cleared before parsing.

        Raises AuditException with code 2 when data is None or empty, and
        with code 3 (carrying data) when the line cannot be parsed.  File
        name and line number are passed as None; they are set by the caller.
        '''
        self._clear()
        if data is None or len(data) < 1:
            # file and line# set by caller
            raise AuditException(2, None, None)
        if structuredLog:
            self._setStructured(data)
        else:
            self._setStandard(data)
        return True

    def _setStructured(self, data):
        '''Fill the fields from a structured (comma separated) audit record.'''
        # column positions within the structured record
        ieventtype = 0
        idate = 3
        iuser = 6
        iclient = 7
        ihost = 9
        iaction = 13
        ifile = 14
        irev = 15
        fields = data.split(',')
        # only event type '6' records with all columns present are accepted
        if len(fields) <= irev or fields[ieventtype] != '6':
            # file and line# set by caller
            raise AuditException(3, None, None, data)
        # the date column holds 'date time ...'; only the first two parts are kept
        dateParts = fields[idate].split(' ')
        if len(dateParts) < 2:
            # a date without a time part used to escape as a bare IndexError
            raise AuditException(3, None, None, data)
        self.f_date = "%s %s" % (dateParts[0], dateParts[1])
        self.f_user = fields[iuser]
        self.f_client = fields[iclient]
        self.f_host = fields[ihost]
        self.f_action = fields[iaction]
        self.f_file = fields[ifile]
        # strip the revision specifier; startswith() also copes with an
        # empty revision column, where indexing [0] raised IndexError
        rev = fields[irev]
        if rev.startswith('#'): self.f_rev = rev[1:]
        else: self.f_rev = rev

    def _setStandard(self, data):
        '''Fill the fields from a standard format audit line.'''
        # f_date f_user@f_client f_host f_action f_file#f_rev
        # (f_date itself is two whitespace separated tokens: date and time)
        fields = []
        for offset, field in enumerate(data.split(maxsplit=5)):
            if offset == 2:
                fields.extend(field.split('@', maxsplit=1))
            elif offset == 5:
                fields.extend(field.split('#', maxsplit=1))
                # found a weird case where no # revision field. add empty string
                if len(fields) < 8: fields.append('')
            else:
                fields.append(field)
        if len(fields) < 8:
            # too few tokens (or no '@' in the user field): report the line as
            # unparseable instead of failing with IndexError below
            raise AuditException(3, None, None, data)
        self.f_date = "%s %s" % (fields[0], fields[1])
        self.f_user = fields[2]
        self.f_client = fields[3]
        self.f_host = fields[4]
        self.f_action = fields[5]
        self.f_file = fields[6]
        self.f_rev = fields[7]

    def getLine(self, rowformat=None):
        '''
        Render the record as a string.

        rowformat -- optional format to use; when given it also replaces
                     self.rowformat for later calls.
        '''
        if rowformat is not None: self.rowformat = rowformat
        return self.__str__()

    def __str__(self):
        # f_date f_user@f_client f_host f_action f_file#f_rev
        return self.rowformat % (self.f_date, self.f_user, self.f_client, self.f_host, self.f_action, self.f_file, self.f_rev)

    def validateRecord(self):
        '''
        Check that the record has a user.

        Raises AuditException with code 4 when f_user is None or empty.  The
        exception carries this record and its utf-8 encoded, newline
        terminated rendering; file name and line number are passed as None.
        '''
        if self.f_user is None or len(self.f_user) < 1:
            # file and line# set by caller
            sError = '%s\n' % str(self)
            raise AuditException(4, None, None, self, sError.encode(encoding='utf_8', errors='strict'))

class HostMRU(object):
    '''
    Remembers, per host, the most recently seen user/client pair so that
    records arriving without a user or client can be repaired.

    hostMap     -- host -> (user, client) of the last record added for it
    hostusermap -- host -> {user: number of records added for that pair}
    '''
    def __init__(self):
        self.hostMap = {}
        self.hostusermap = {}

    @staticmethod
    def _isBlank(value):
        '''True when value is None, empty, or the literal 'unknown'.'''
        return value is None or len(value) < 1 or value == 'unknown'

    def addRecord(self, aRecord):
        '''
        Remember the user and client of aRecord under its host.

        Returns False (and stores nothing) when aRecord is None or its host
        is None, empty or 'unknown'; True otherwise.
        '''
        if aRecord is None or self._isBlank(aRecord.f_host):
            return False
        host = aRecord.f_host
        user = aRecord.f_user
        # latest pair wins
        self.hostMap[host] = (user, aRecord.f_client)
        # per host tally of how often each user was seen
        seenUsers = self.hostusermap.setdefault(host, {})
        seenUsers[user] = seenUsers.get(user, 0) + 1
        return True

    def fixRecord(self, aRecord):
        '''
        Fill a blank client and/or user on aRecord from the pair stored for
        its host.

        A blank field is overwritten even when nothing is stored for the
        host, in which case it becomes None.  Returns True when the record
        ends up with a usable user, False otherwise (including when aRecord
        is None or its host is None, empty or 'unknown').
        '''
        if aRecord is None or self._isBlank(aRecord.f_host):
            return False
        knownUser, knownClient = self.hostMap.get(aRecord.f_host, (None, None))
        if self._isBlank(aRecord.f_client):
            aRecord.f_client = knownClient
        if self._isBlank(aRecord.f_user):
            aRecord.f_user = knownUser
        return not self._isBlank(aRecord.f_user)

if __name__ == '__main__':
    # Script entry point:
    #   1. configure logging
    #   2. convert every regular file found in the -i directory
    #   3. optionally write the anonymization map, then close the anonymizer

    # One handler per level.  The isDEBUG/isINFO/isWARN/isERROR filters are
    # defined elsewhere in this file (presumably each lets through only the
    # records of its own level -- confirm there).  INFO output goes to stdout;
    # the other handlers use StreamHandler's default stream (stderr).
    logger = logging.getLogger(scriptname)
    logger.propagate = False
    logger.setLevel(logging.INFO)
    debugHandler = logging.StreamHandler()
    debugHandler.setLevel(logging.DEBUG)
    debugHandler.setFormatter(logging.Formatter('%(levelname)s:%(filename)s:%(lineno)d:%(funcName)s:%(message)s'))
    debugHandler.addFilter(isDEBUG)
    logger.addHandler(debugHandler)
    infoHandler = logging.StreamHandler(sys.stdout)
    infoHandler.setLevel(logging.INFO)
    infoHandler.setFormatter(logging.Formatter('%(message)s'))
    infoHandler.addFilter(isINFO)
    logger.addHandler(infoHandler)
    warnHandler = logging.StreamHandler()
    warnHandler.setLevel(logging.WARN)
    warnHandler.setFormatter(logging.Formatter('%(message)s'))
    warnHandler.addFilter(isWARN)
    logger.addHandler(warnHandler)
    errorHandler = logging.StreamHandler()
    errorHandler.setLevel(logging.ERROR)
    errorHandler.setFormatter(logging.Formatter('%(message)s'))
    errorHandler.addFilter(isERROR)
    logger.addHandler(errorHandler)

    args = processInputParams(sys.argv[1:])
    anonimizer = Anonymizer(args.database)
    if args.logformat == 1: outputRecordFormat = STRUCTURED_RECORDFORMAT
    else: outputRecordFormat = P4AUDIT_RECORDFORMAT
    recordFixer = HostMRU()
    if hasattr(args, 'input') and args.input is not None:
        # only regular files directly inside the input directory are processed
        fileList = [name for name in os.listdir(args.input)
                    if os.path.isfile(os.path.join(args.input, name))]
        if len(fileList) < 1:
            logger.info('No files to process in -i %s' % (args.input))
        # sort filenames, then turn them into full paths
        fileList = [os.path.join(args.input, name) for name in sortFiles(fileList)]
        utf8converter = UTF8Converter()
        # args.maxSize is multiplied by 1048576, i.e. it is given in MiB;
        # a result below 1 is passed on as None
        maxWriteSize = 1048576 * args.maxSize
        if maxWriteSize < 1: maxWriteSize = None
        # error file shared by all input files; opened on the first bad line
        fError = None
        try:
            for fileOffset, fileName in enumerate(fileList):
                fRead = None
                fWrite = None
                charsWritten = 0
                fileNameOut = None
                dtStart = datetime.datetime.now()
                try:
                    # output name: input base name without a trailing '.gz',
                    # plus '.utf8' [+ '.anonymized'] [+ '.gz']
                    baseName = os.path.basename(fileName)
                    fileNameOut = baseName
                    if fileName.endswith('.gz'):
                        fileNameOut = baseName[0:len(baseName)-3]
                    fileNameOut += '.utf8'
                    if args.database: fileNameOut += '.anonymized'
                    if args.compress: fileNameOut += '.gz'
                    fileNameOut = os.path.join(args.output, fileNameOut)
                    fRead = AuditFileIO(fileName, True, args.casesensitive, utf8converter, None)
                    fWrite = AuditFileIO(fileNameOut, False, args.casesensitive, utf8converter, maxWriteSize)
                    if fWrite.splitfiles:
                        logger.info('Processing file: %s (%d of %d) converting to: %s (splitfile enabled)' % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName))
                        logger.debug('splitfiles enabled with maxSize of %d bytes' % fWrite.maxWriteSize)
                    else:
                        logger.info('Processing file: %s (%d of %d) converting to: %s' % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName))
                    while True:
                        try:
                            try:
                                aRecord = fRead.readRecord(args.validate)
                                # record passed validation so add it to the MRU
                                recordFixer.addRecord(aRecord)
                            except AuditException as e:
                                errCode = e.args[0]
                                # only validation failures (code 4) can be repaired
                                if errCode != 4: raise
                                # for code 4 the failed record travels in args[3]
                                aRecord = e.args[3]
                                # record failed validation try data in MRU
                                if not recordFixer.fixRecord(aRecord): raise
                                aRecord.validateRecord()
                            # eof
                            if aRecord is None: break
                            # anonymize
                            if args.database:
                                # Helix Threat Detection is case insensitive for files. If we are anonymizing the data then force case insensitive
                                aRecord.f_client = aRecord.f_client.lower()
                                aRecord.f_file = aRecord.f_file.lower()
                                aRecord.f_user = aRecord.f_user.lower()
                                aRecord = anonimizer.anonymizeRecord(aRecord)
                            # write output
                            fWrite.writeLine(aRecord.getLine(outputRecordFormat))
                        except AuditException as e:
                            # A bad line is logged, copied to the error file
                            # and skipped; processing continues with the next
                            # line.  A separate name is used for the file
                            # reported by the exception so that the loop's
                            # fileName is not overwritten.
                            errCode = e.args[0]
                            errFileName = e.args[1]
                            lineNo = e.args[2]
                            logger.error(str(e))
                            if errCode in [1,3,4]:
                                #write bad line to error file
                                if fError is None:
                                    fileNameErr = os.path.join(args.output, "auditconverter.err")
                                    fError = open(fileNameErr, 'a+b')
                                    fError.write('\n'.encode(encoding='utf_8', errors='strict'))
                                # code 1: args[3] is written as-is (bytes)
                                # code 3: args[3] is the offending line as str
                                # code 4: args[4] is the encoded record
                                if errCode == 1: lineBin = e.args[3]
                                elif errCode == 3: lineBin = ("%s\n" % (e.args[3])).encode(encoding='utf_8', errors='strict')
                                elif errCode == 4: lineBin = e.args[4]
                                prefix = 'file=%s;line=%d;err=%d;:' % (errFileName, lineNo, errCode)
                                fError.write(prefix.encode(encoding='utf_8', errors='strict'))
                                fError.write(lineBin)
                    # Logger.warn is a deprecated alias that newer Python
                    # versions no longer provide; warning() works everywhere
                    if fRead.linesRead != fWrite.linesWritten: logger.warning('Lines read: %d; Lines written: %d' % (fRead.linesRead, fWrite.linesWritten))
                    dtStop = datetime.datetime.now()
                    # total_seconds() keeps counting past 24 hours, whereas
                    # timedelta.seconds wraps; never less than 1 so the rate
                    # below cannot divide by zero
                    seconds = max(1, int((dtStop-dtStart).total_seconds()))
                    if fWrite.splitfiles and fWrite.parts > 0:
                        logger.info('Completed: %s (%d of %d) converting to: %s (%d splitparts) with %d lines in %d seconds (%d lines/second.)' % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName, fWrite.parts, fRead.linesRead, seconds, int(fRead.linesRead/seconds)))
                    else:
                        logger.info('Completed: %s (%d of %d) converting to: %s with %d lines in %d seconds (%d lines/second.)' % (fRead.fileName, fileOffset+1, len(fileList), fWrite.fileName, fRead.linesRead, seconds, int(fRead.linesRead/seconds)))
                except Exception:
                    # A failure in one file must not stop the remaining files.
                    # KeyboardInterrupt/SystemExit are deliberately not caught
                    # so that Ctrl-C ends the run instead of skipping to the
                    # next file.
                    logger.exception('unknown exception processing input file: %s' % (fileName))
                finally:
                    if fRead is not None: fRead.close()
                    if fWrite is not None: fWrite.close()
                anonimizer.debugMaps()
        finally:
            # close the error file even when the run is interrupted
            if fError is not None: fError.close()
    if hasattr(args, 'anonymizationmap') and args.anonymizationmap is not None:
        logger.info('writing anonymization map to: %s' % args.anonymizationmap)
        anonimizer.dump(args.anonymizationmap)
    anonimizer.close()