import csv
import re
import os

# Default locations used when this module is run as a script.
# Both paths are relative, so they resolve against the current working
# directory rather than against the location of this file.
input_file_path = 'data/esg_corpus_original.csv'  # malformed CSV to read
output_file_path = 'data/esg_corpus.csv'  # sanitized CSV to write

def sanitize_csv(input_path, output_path):
    """
    Rewrite a malformed three-column CSV as a valid, fully quoted CSV.

    The input is expected to consist of a header line followed by records of
    the form ``index,filename,markdown``. The markdown field may contain
    commas and may span several lines, so records are located by looking for
    a line that starts with digits followed by a comma instead of relying on
    a CSV parser.

    Known limitation: a line *inside* a markdown field that itself starts
    with digits followed by a comma is treated as the start of a new record.

    Args:
        input_path: Path of the malformed CSV to read (UTF-8, BOM tolerated).
        output_path: Path of the sanitized CSV to write. Its parent directory
            is created if it does not exist.

    Returns:
        None. Progress and errors are reported on stdout; exceptions are
        caught here and not propagated to the caller.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 as usual but drops a leading BOM,
        # which would otherwise end up inside the first header cell.
        with open(input_path, 'r', encoding='utf-8-sig') as f:
            content = f.read()

        if '\n' not in content:
            print("File is empty or has only one line.")
            return

        # The header is the first line; everything after it is record data.
        header, body = content.split('\n', 1)

        # A record starts with a number followed by a comma. The lookahead
        # keeps that prefix attached to the record it introduces, and lets
        # newlines inside the markdown field survive the split.
        records_raw = re.split(r'\n(?=\d+,)', body)

        # Create the output directory only once the input has been read, so a
        # missing or empty input does not leave an empty directory behind.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(output_path, 'w', newline='', encoding='utf-8') as f_out:
            writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)
            writer.writerow([h.strip() for h in header.strip().split(',')])

            for raw_record in records_raw:
                if not raw_record.strip():
                    continue

                # Split only on the first two commas so that commas inside
                # the markdown field stay part of that field.
                parts = raw_record.split(',', 2)
                if len(parts) < 3:
                    print(f"Skipping malformed record: {raw_record[:100]}")
                    continue

                index = parts[0].strip()
                filename = parts[1].strip()

                # Strip surrounding whitespace *before* looking for the
                # wrapping quotes: the final record usually ends with the
                # file's trailing newline, which would otherwise hide the
                # closing quote from endswith().
                markdown = parts[2].strip()
                if markdown.startswith('"'):
                    markdown = markdown[1:]
                if markdown.endswith('"'):
                    markdown = markdown[:-1]
                markdown = markdown.strip()

                writer.writerow([index, filename, markdown])

        print(f"Sanitized data saved to {output_path}")

    except FileNotFoundError:
        # Report the path that was actually requested, not a module default.
        print(f"Error: File not found at {input_path}. Make sure '{input_path}' is in your repository.")
    except Exception as e:
        # Top-level boundary for a command-line utility: report and return.
        print(f"An error occurred during sanitization: {e}")

# Script entry point: sanitize the default corpus file when executed directly.
if __name__ == "__main__":
    sanitize_csv(input_path=input_file_path, output_path=output_file_path)