diff --git a/source/lib/O365Hunter.py b/source/lib/O365Hunter.py
new file mode 100644
index 0000000..4de7600
--- /dev/null
+++ b/source/lib/O365Hunter.py
@@ -0,0 +1,321 @@
+import json
+import sqlite3
+import os
+import time
+import csv
+from pathlib import Path
+
+import pandas as pd
+import geoip2.database
+import requests
+from dateutil import parser, tz
+
+# Hunting queries run against the flattened audit events stored in SQLite.
+password_spray_query = '''
+WITH FailedLogins AS (
+    SELECT
+        UserId,
+        ClientIP,
+        datetime(CreationTime) AS LoginDate
+    FROM
+        events
+    WHERE
+        Operation = 'UserLoginFailed'
+)
+SELECT
+    UserId,
+    GROUP_CONCAT(ClientIP, ', ') AS ClientIPs,
+    COUNT(DISTINCT ClientIP) AS UniqueIPCount,
+    COUNT(*) AS FailedLoginAttempts,
+    LoginDate
+FROM
+    FailedLogins
+GROUP BY
+    UserId,
+    strftime('%Y-%m-%d %H', LoginDate)
+HAVING
+    COUNT(*) > 5 AND UniqueIPCount > 3
+ORDER BY
+    FailedLoginAttempts DESC;
+'''
+
+user_logon_query = '''
+SELECT
+    UserId,
+    date(CreationTime) AS LoginDate,
+    COUNT(*) AS TotalLoginAttempts,
+    SUM(CASE WHEN Operation = 'UserLoggedIn' THEN 1 ELSE 0 END) AS SuccessfulLogins,
+    SUM(CASE WHEN Operation = 'UserLoginFailed' THEN 1 ELSE 0 END) AS FailedLogins
+FROM
+    events
+WHERE
+    Operation = 'UserLoggedIn' OR Operation = 'UserLoginFailed'
+GROUP BY
+    UserId,
+    LoginDate
+ORDER BY
+    LoginDate,
+    UserId;
+'''
+
+User_operations_query = '''
+SELECT
+    UserId,
+    COUNT(DISTINCT Operation) AS OperationCount,
+    GROUP_CONCAT(Operation, ', ') AS UniqueOperations
+FROM
+    (SELECT DISTINCT UserId, Operation FROM events)
+GROUP BY
+    UserId
+ORDER BY
+    OperationCount DESC;
+'''
+
+user_operation_by_day_query = '''
+SELECT
+    UserId,
+    DATE(CreationTime) AS OperationDate,
+    COUNT(DISTINCT Operation) AS OperationCount,
+    GROUP_CONCAT(Operation, ', ') AS UniqueOperations
+FROM
+    events
+GROUP BY
+    UserId,
+    OperationDate
+ORDER BY
+    OperationCount DESC
+'''
+
+
+def convert_csv(input_file, temp):
+    """Extract the AuditData column from an Office 365 audit CSV export and write it as JSON Lines."""
+    json_file = os.path.join(temp, 'audit_data.json')
+    with open(input_file, 'r', encoding='utf-8') as csv_file:
+        reader = csv.DictReader(csv_file)
+        with open(json_file, 'w', encoding='utf-8') as jsonl_file:
+            for row in reader:
+                # AuditData is already a JSON-formatted string; re-serialise it
+                # to normalise the formatting and write one record per line.
+                json_data = json.loads(row['AuditData'])
+                jsonl_file.write(json.dumps(json_data) + '\n')
+
+    return json_file
+
+
+def flatten_json_file(input_file, timezone, chunk_size=10000):
+    """Read the JSON Lines file in chunks and flatten it into a single DataFrame."""
+    chunks = []
+    with open(input_file, 'r') as file:
+        lines = file.readlines()
+
+    for i in range(0, len(lines), chunk_size):
+        chunk = [json.loads(line) for line in lines[i:i + chunk_size]]
+
+        # Convert CreationTime to the requested timezone.
+        for record in chunk:
+            if 'CreationTime' in record:
+                creation_time = parser.parse(record['CreationTime'])
+
+                # Assume UTC when the timestamp carries no timezone info.
+                if creation_time.tzinfo is None:
+                    creation_time = creation_time.replace(tzinfo=tz.tzutc())
+
+                record['CreationTime'] = creation_time.astimezone(timezone).isoformat()
+
+        chunks.append(pd.json_normalize(chunk))
+
+    # Concatenate all chunks into a single DataFrame.
+    flattened_records = pd.concat(chunks, ignore_index=True)
+    return flattened_records
+
+
+def create_sqlite_db_from_dataframe(dataframe, db_name):
+    conn = sqlite3.connect(db_name)
+
+    # Convert all columns to string and store every field as TEXT.
+    dataframe = dataframe.astype(str)
+    dataframe.to_sql('events', conn, if_exists='replace', index=False,
+                     dtype={col_name: 'TEXT' for col_name in dataframe.columns})
+
+    conn.close()
+
+
+def read_detection_rules(rule_file):
+    with open(rule_file, 'r') as file:
+        rules = json.load(file)
+    return rules
+
+
+def apply_detection_logic_sqlite(db_name, rules):
+    conn = sqlite3.connect(db_name)
+    all_detected_events = []
+
+    for rule in rules:
+        rule_name = rule['name']
+        severity = rule['severity']
+        query = rule['query']
+
+        detected_events = pd.read_sql_query(query, conn)
+        detected_events['RuleName'] = rule_name
+        detected_events['Severity'] = severity
+
+        all_detected_events.append(detected_events)
+
+    conn.close()
+
+    if all_detected_events:
+        result = pd.concat(all_detected_events, ignore_index=True)
+    else:
+        result = pd.DataFrame()
+
+    return result
+
+
+def download_geolite_db(geolite_db_path):
+    url = "https://git.io/GeoLite2-Country.mmdb"
+    print(f"Downloading GeoLite2 database from {url}...")
+    response = requests.get(url)
+    response.raise_for_status()  # Fail early if the download was not successful
+
+    with open(geolite_db_path, 'wb') as file:
+        file.write(response.content)
+    print(f"GeoLite2 database downloaded and saved to {geolite_db_path}")
+
+
+def get_country_from_ip(ip, reader):
+    try:
+        response = reader.country(ip)
+        return response.country.name
+    except Exception:
+        # Unresolvable, private or malformed addresses are reported as 'Unknown'.
+        return 'Unknown'
+
+
+def analyzeoff365(auditfile, rule_file, output, timezone, include_flattened_data=False,
+                  geolite_db_path='GeoLite2-Country.mmdb'):
+    start_time = time.time()
+    temp_dir = ".temp"
+    if output is None or output == "":
+        output = os.path.splitext(auditfile)[0]
+    try:
+        # Create the output and temporary working directories
+        os.makedirs(output, exist_ok=True)
+        os.makedirs(temp_dir, exist_ok=True)
+
+        # Download the GeoLite2 database if it is not already present
+        if not os.path.exists(geolite_db_path):
+            download_geolite_db(geolite_db_path)
+
+        # Convert the CSV export to JSON Lines
+        json_file = convert_csv(auditfile, temp_dir)
+
+        # Input and output file paths
+        input_file = json_file
+        db_name = os.path.join(temp_dir, 'audit_data.db')
+
+        if rule_file is None:
+            rule_file = 'O365_detection_rules.json'
+        output_file = f"{output}_o365_report.xlsx"
+
+        # Flatten the JSON file
+        flattened_df = flatten_json_file(input_file, timezone)
+
+        # Create SQLite database from the flattened DataFrame
+        create_sqlite_db_from_dataframe(flattened_df, db_name)
+
+        # Resolve ClientIP to country names using the GeoLite2 database
+        with geoip2.database.Reader(geolite_db_path) as reader:
+            if 'ClientIP' in flattened_df.columns:
+                flattened_df['Country'] = flattened_df['ClientIP'].apply(lambda ip: get_country_from_ip(ip, reader))
+
+        # Read detection rules
+        rules = read_detection_rules(rule_file)
+
+        # Apply detection logic using SQLite
+        detected_events = apply_detection_logic_sqlite(db_name, rules)
+
+        # Reorder columns to make RuleName and Severity the first columns
+        if not detected_events.empty:
+            columns = ['RuleName', 'Severity'] + [col for col in detected_events.columns
+                                                  if col not in ['RuleName', 'Severity']]
+            detected_events = detected_events[columns]
+
+        # Run the hunting queries (logon tracking, password spray, user operations)
+        conn = sqlite3.connect(db_name)
+        try:
+            user_login_tracker_df = pd.read_sql_query(user_logon_query, conn)
+            password_spray_df = pd.read_sql_query(password_spray_query, conn)
+            user_operations_df = pd.read_sql_query(User_operations_query, conn)
+            user_operation_by_day_df = pd.read_sql_query(user_operation_by_day_query, conn)
+        finally:
+            conn.close()
+
+        # Create a new workbook with the detection results
+        with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
+            if include_flattened_data:
+                # Split the flattened data into multiple sheets if needed
+                max_rows_per_sheet = 65000
+                num_sheets = len(flattened_df) // max_rows_per_sheet + 1
+
+                for i in range(num_sheets):
+                    start_row = i * max_rows_per_sheet
+                    end_row = (i + 1) * max_rows_per_sheet
+                    sheet_name = f'Flattened Data {i + 1}'
+                    flattened_df.iloc[start_row:end_row].to_excel(writer, sheet_name=sheet_name, index=False)
+
+            # Write the detection and hunting results, then per-field statistics
+            detected_events.to_excel(writer, sheet_name='Detection Results', index=False)
+            user_login_tracker_df.to_excel(writer, sheet_name='User Login Tracker', index=False)
+            password_spray_df.to_excel(writer, sheet_name='Password Spray Attacks', index=False)
+            user_operations_df.to_excel(writer, sheet_name='User Operations', index=False)
+            user_operation_by_day_df.to_excel(writer, sheet_name='User Operations by Day', index=False)
+            flattened_df['Operation'].value_counts().to_frame().to_excel(writer, sheet_name='Operation Stats')
+            flattened_df['ClientIP'].value_counts().to_frame().to_excel(writer, sheet_name='ClientIP Stats')
+            flattened_df['Country'].value_counts().to_frame().to_excel(writer, sheet_name='Country Stats')
+            flattened_df['UserAgent'].value_counts().to_frame().to_excel(writer, sheet_name='UserAgent Stats')
+            flattened_df['UserId'].value_counts().to_frame().to_excel(writer, sheet_name='UserId Stats')
+            flattened_df['AuthenticationType'].value_counts().to_frame().to_excel(
+                writer, sheet_name='AuthenticationType Stats')
+
+        end_time = time.time()
+        print(f"Office365 analysis finished in time: {end_time - start_time:.2f} seconds")
+
+    except Exception as e:
+        print(f"An error occurred during the analysis: {e}")
+
+    finally:
+        # Clean up the temporary directory
+        if os.path.exists(temp_dir):
+            for file in Path(temp_dir).glob('*'):
+                file.unlink()  # Delete the file
+            os.rmdir(temp_dir)  # Remove the directory
+
+    # Calculate and print the total running time
+    running_time = time.time() - start_time
+    print(f"Office365 hunter finished in time: {running_time:.2f} seconds")
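
The patch does not show what O365_detection_rules.json looks like or how analyzeoff365() is meant to be called. The sketch below (not part of the patch) is inferred from read_detection_rules() and apply_detection_logic_sqlite(): each rule needs name, severity and query keys, and the query runs against the flattened events table. The file names, the import path and the sample rule are illustrative assumptions only.

import json
from dateutil import tz
from O365Hunter import analyzeoff365  # assumed import path; adjust to your layout

# Hypothetical rule file: one SQL query per rule, executed against the 'events' table.
rules = [
    {
        "name": "Mailbox permission added",
        "severity": "Medium",
        "query": "SELECT UserId, CreationTime, Operation FROM events "
                 "WHERE Operation = 'Add-MailboxPermission'"
    }
]
with open("my_rules.json", "w") as fh:
    json.dump(rules, fh, indent=2)

# Analyse a unified audit log CSV export, reporting timestamps in UTC.
# An empty output string derives the report name from the CSV file name.
analyzeoff365(
    auditfile="AuditLog.csv",
    rule_file="my_rules.json",
    output="",
    timezone=tz.tzutc(),
    include_flattened_data=False,
)

Note that timezone must be a tzinfo object (it is passed straight to astimezone()), not a string.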