import csv
import json
import os
import sqlite3
import time
from pathlib import Path

import geoip2.database
import pandas as pd
import requests
from dateutil import parser, tz

password_spray_query = '''
    WITH FailedLogins AS (
        SELECT
            UserId,
            ClientIP,
            datetime(CreationTime) AS LoginDate
        FROM events
        WHERE Operation = 'UserLoginFailed'
    )
    SELECT
        UserId,
        GROUP_CONCAT(ClientIP, ', ') AS ClientIPs,
        COUNT(DISTINCT ClientIP) AS UniqueIPCount,
        COUNT(*) AS FailedLoginAttempts,
        LoginDate
    FROM FailedLogins
    GROUP BY UserId, strftime('%Y-%m-%d %H', LoginDate)
    HAVING COUNT(*) > 5 AND UniqueIPCount > 3
    ORDER BY FailedLoginAttempts DESC;
'''

user_logon_query = '''
    SELECT
        UserId,
        date(CreationTime) AS LoginDate,
        COUNT(*) AS TotalLoginAttempts,
        SUM(CASE WHEN Operation = 'UserLoggedIn' THEN 1 ELSE 0 END) AS SuccessfulLogins,
        SUM(CASE WHEN Operation = 'UserLoginFailed' THEN 1 ELSE 0 END) AS FailedLogins
    FROM events
    WHERE Operation = 'UserLoggedIn' OR Operation = 'UserLoginFailed'
    GROUP BY UserId, LoginDate
    ORDER BY LoginDate, UserId;
'''

User_operations_query = '''
    SELECT
        UserId,
        COUNT(DISTINCT Operation) AS OperationCount,
        GROUP_CONCAT(Operation, ', ') AS UniqueOperations
    FROM (SELECT DISTINCT UserId, Operation FROM events)
    GROUP BY UserId
    ORDER BY OperationCount DESC;
'''

user_operation_by_day_query = '''
    SELECT
        UserId,
        DATE(CreationTime) AS OperationDate,
        COUNT(DISTINCT Operation) AS OperationCount,
        GROUP_CONCAT(Operation, ', ') AS UniqueOperations
    FROM events
    GROUP BY UserId, OperationDate
    ORDER BY OperationCount DESC;
'''


def convert_csv(input_file, temp):
    """Extract the AuditData column from an exported audit CSV into a JSON Lines file."""
    json_file = os.path.join(temp, 'audit_data.json')
    with open(input_file, 'r', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        with open(json_file, 'w', encoding='utf-8') as jsonl_file:
            # Each AuditData cell is already a JSON-formatted string; parse it and
            # write it back out as one JSON object per line.
            for row in reader:
                json_data = json.loads(row['AuditData'])
                jsonl_file.write(json.dumps(json_data) + '\n')
    return json_file


def flatten_json_file(input_file, timezone, chunk_size=10000):
    """Flatten the JSON Lines records into a DataFrame, converting CreationTime to the given timezone."""
    chunks = []
    with open(input_file, 'r') as file:
        lines = file.readlines()

    # Process the file in chunks to keep memory usage bounded
    for i in range(0, len(lines), chunk_size):
        chunk = [json.loads(line) for line in lines[i:i + chunk_size]]

        # Convert CreationTime to the desired timezone
        for record in chunk:
            if 'CreationTime' in record:
                creation_time = parser.parse(record['CreationTime'])
                # Assume UTC when the timestamp carries no timezone information
                if creation_time.tzinfo is None:
                    creation_time = creation_time.replace(tzinfo=tz.tzutc())
                record['CreationTime'] = creation_time.astimezone(timezone).isoformat()

        chunks.append(pd.json_normalize(chunk))

    # Concatenate all chunks into a single DataFrame
    flattened_records = pd.concat(chunks, ignore_index=True)
    return flattened_records


def create_sqlite_db_from_dataframe(dataframe, db_name):
    """Write the flattened DataFrame to a SQLite database, storing every field as TEXT."""
    conn = sqlite3.connect(db_name)
    dataframe = dataframe.astype(str)
    dataframe.to_sql('events', conn, if_exists='replace', index=False,
                     dtype={col_name: 'TEXT' for col_name in dataframe.columns})
    conn.close()
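
# Expected structure of the rule file consumed by read_detection_rules() below.
# Illustrative sketch only: the rule itself is made up, but the 'name', 'severity'
# and 'query' keys match how apply_detection_logic_sqlite() uses each rule:
#
# [
#     {
#         "name": "Suspicious inbox rule creation",
#         "severity": "High",
#         "query": "SELECT UserId, ClientIP, CreationTime FROM events WHERE Operation = 'New-InboxRule'"
#     }
# ]
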

def read_detection_rules(rule_file):
    """Load the JSON detection rule set."""
    with open(rule_file, 'r') as file:
        rules = json.load(file)
    return rules


def apply_detection_logic_sqlite(db_name, rules):
    """Run every rule query against the events table and tag the hits with the rule name and severity."""
    conn = sqlite3.connect(db_name)
    all_detected_events = []

    for rule in rules:
        rule_name = rule['name']
        severity = rule['severity']
        query = rule['query']

        detected_events = pd.read_sql_query(query, conn)
        detected_events['RuleName'] = rule_name
        detected_events['Severity'] = severity
        all_detected_events.append(detected_events)

    conn.close()

    if all_detected_events:
        result = pd.concat(all_detected_events, ignore_index=True)
    else:
        result = pd.DataFrame()
    return result


def download_geolite_db(geolite_db_path):
    """Download the GeoLite2 country database to the given path."""
    url = "https://git.io/GeoLite2-Country.mmdb"
    print(f"Downloading GeoLite2 database from {url}...")
    response = requests.get(url)
    response.raise_for_status()  # Abort if the download was not successful

    with open(geolite_db_path, 'wb') as file:
        file.write(response.content)
    print(f"GeoLite2 database downloaded and saved to {geolite_db_path}")


def get_country_from_ip(ip, reader):
    """Resolve an IP address to a country name; unresolvable IPs are reported as 'Unknown'."""
    try:
        return reader.country(ip).country.name
    except Exception:
        return 'Unknown'


def analyzeoff365(auditfile, rule_file, output, timezone, include_flattened_data=False,
                  geolite_db_path='GeoLite2-Country.mmdb'):
    start_time = time.time()
    temp_dir = ".temp"

    if output is None or output == "":
        output = os.path.splitext(auditfile)[0]

    try:
        # Create necessary directories
        os.makedirs(output, exist_ok=True)
        os.makedirs(temp_dir, exist_ok=True)

        # Download the GeoLite2 database if it is not already present
        if not os.path.exists(geolite_db_path):
            download_geolite_db(geolite_db_path)

        # Convert the audit CSV export to JSON Lines
        json_file = convert_csv(auditfile, temp_dir)

        # Input and output file paths
        input_file = json_file
        db_name = os.path.join(temp_dir, 'audit_data.db')
        if rule_file is None:
            rule_file = 'O365_detection_rules.json'
        output_file = f"{output}_o365_report.xlsx"

        # Flatten the JSON file
        flattened_df = flatten_json_file(input_file, timezone)

        # Create the SQLite database from the flattened DataFrame
        create_sqlite_db_from_dataframe(flattened_df, db_name)

        # Open the GeoLite2 database and resolve ClientIP to country names
        with geoip2.database.Reader(geolite_db_path) as reader:
            if 'ClientIP' in flattened_df.columns:
                flattened_df['Country'] = flattened_df['ClientIP'].apply(
                    lambda ip: get_country_from_ip(ip, reader))

        # Read detection rules
        rules = read_detection_rules(rule_file)

        # Apply detection logic using SQLite
        detected_events = apply_detection_logic_sqlite(db_name, rules)

        # Reorder columns so RuleName and Severity come first
        if not detected_events.empty:
            columns = ['RuleName', 'Severity'] + [col for col in detected_events.columns
                                                  if col not in ['RuleName', 'Severity']]
            detected_events = detected_events[columns]

        # Run the login-tracking, password-spray and user-operation summary queries
        conn = sqlite3.connect(db_name)
        try:
            user_login_tracker_df = pd.read_sql_query(user_logon_query, conn)
            password_spray_df = pd.read_sql_query(password_spray_query, conn)
            user_operations_df = pd.read_sql_query(User_operations_query, conn)
            user_operation_by_day_df = pd.read_sql_query(user_operation_by_day_query, conn)
        finally:
            conn.close()

        # Create a new workbook with the detection results
        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
            if include_flattened_data:
                # Split the flattened data across multiple sheets to stay below Excel's row limit
                max_rows_per_sheet = 65000
                num_sheets = len(flattened_df) // max_rows_per_sheet + 1

                for i in range(num_sheets):
                    start_row = i * max_rows_per_sheet
                    end_row = (i + 1) * max_rows_per_sheet
                    sheet_name = f'Flattened Data {i + 1}'
                    flattened_df.iloc[start_row:end_row].to_excel(writer, sheet_name=sheet_name, index=False)

            # Write the detection results and the summary sheets
            detected_events.to_excel(writer, sheet_name='Detection Results', index=False)
            user_login_tracker_df.to_excel(writer, sheet_name='User Login Tracker', index=False)
            password_spray_df.to_excel(writer, sheet_name='Password Spray Attacks', index=False)
            user_operations_df.to_excel(writer, sheet_name='User Operations', index=False)
            user_operation_by_day_df.to_excel(writer, sheet_name='User Operations by Day', index=False)

            # Write statistics for various fields
            flattened_df['Operation'].value_counts().to_frame().to_excel(writer, sheet_name='Operation Stats')
            flattened_df['ClientIP'].value_counts().to_frame().to_excel(writer, sheet_name='ClientIP Stats')
            flattened_df['Country'].value_counts().to_frame().to_excel(writer, sheet_name='Country Stats')
            flattened_df['UserAgent'].value_counts().to_frame().to_excel(writer, sheet_name='UserAgent Stats')
            flattened_df['UserId'].value_counts().to_frame().to_excel(writer, sheet_name='UserId Stats')
            flattened_df['AuthenticationType'].value_counts().to_frame().to_excel(writer, sheet_name='AuthenticationType Stats')

        # Measure the end time of the analysis phase
        end_time = time.time()
        print(f"Office365 analysis finished in time: {end_time - start_time:.2f} seconds")

    except Exception as e:
        print(f"An error occurred during the analysis: {e}")

    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_dir):
            for file in Path(temp_dir).glob('*'):
                file.unlink()  # Delete the file
            os.rmdir(temp_dir)  # Remove the directory

    # Calculate and print the total running time
    end_time = time.time()
    running_time = end_time - start_time
    print(f"Office365 hunter finished in time: {running_time:.2f} seconds")
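

# Example invocation, shown as an illustrative sketch only: the audit file path and
# timezone below are placeholder assumptions, not files shipped with this script.
if __name__ == "__main__":
    analyzeoff365(
        auditfile="AuditLog.csv",               # exported unified audit log CSV (placeholder)
        rule_file=None,                         # None falls back to 'O365_detection_rules.json'
        output=None,                            # None derives the report name from the audit file
        timezone=tz.gettz("UTC"),               # any dateutil tzinfo object
        include_flattened_data=False,
        geolite_db_path="GeoLite2-Country.mmdb",
    )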