import csv
import os
import re

# Use relative paths for portability
input_file_path = 'data/esg_corpus_original.csv'
output_file_path = 'data/esg_corpus.csv'

# A record starts on a line that begins with a number followed by a comma.
# Splitting on that boundary (instead of on every newline) keeps multi-line
# markdown fields attached to the record they belong to.
# NOTE(review): a markdown line that itself starts with "<digits>," will be
# treated as a new record; confirm the corpus never contains such lines.
_RECORD_BOUNDARY = re.compile(r'\n(?=\d+,)')


def _clean_markdown(markdown):
    """Return *markdown* without surrounding whitespace or wrapping quotes."""
    # Strip BEFORE testing for quotes: a record's raw text frequently ends
    # with a newline after the closing quote (always true for the last record
    # of a newline-terminated file), which would otherwise hide that quote
    # from ``endswith`` and leave it in the output.
    markdown = markdown.strip()
    if markdown.startswith('"'):
        markdown = markdown[1:]
    if markdown.endswith('"'):
        markdown = markdown[:-1]
    return markdown.strip()


def sanitize_csv(input_path, output_path):
    """Rewrite a malformed three-column CSV as a valid, fully quoted CSV.

    The input is expected to contain a header line followed by records of the
    form ``index,filename,markdown`` where the markdown field may span several
    lines and may contain commas. Each record is split on its first two commas
    only, the markdown field is cleaned with ``_clean_markdown``, and the
    result is written with ``csv.QUOTE_ALL`` so the output parses reliably.

    Args:
        input_path: Path of the malformed CSV file to read (UTF-8).
        output_path: Path of the sanitized CSV file to write (UTF-8). Its
            parent directory is created if it does not exist.

    Returns:
        None. Progress and problems are reported on standard output; errors
        are caught and printed rather than raised.
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()

        if '\n' not in content:
            print("File is empty or has only one line.")
            return

        # The header is the first line; everything after it is record data.
        header, body = content.split('\n', 1)
        records_raw = _RECORD_BOUNDARY.split(body)

        # Create the output directory only once the input has been read
        # successfully, so a failed run does not leave an empty directory.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
            writer.writerow([h.strip() for h in header.strip().split(',')])

            for raw_record in records_raw:
                if not raw_record.strip():
                    continue

                # Split only on the first two commas so that commas inside
                # the markdown field are preserved.
                parts = raw_record.split(',', 2)
                if len(parts) < 3:
                    print(f"Skipping malformed record: {raw_record[:100]}")
                    continue

                index, filename, markdown = parts
                writer.writerow(
                    [index.strip(), filename.strip(), _clean_markdown(markdown)]
                )

        print(f"Sanitized data saved to {output_path}")

    except FileNotFoundError:
        # Report the path that was actually requested, not the module default.
        print(
            f"Error: File not found at {input_path}. "
            f"Make sure '{input_path}' is in your repository."
        )
    except Exception as e:
        # Top-level boundary of a best-effort script: report and carry on.
        print(f"An error occurred during sanitization: {e}")


if __name__ == "__main__":
    sanitize_csv(input_file_path, output_file_path)