"""Extract food names and insulin index (II) values from a PDF into a CSV.

The script reads every page of ``PDF_PATH``, dumps the raw text to
``TEXT_DUMP_PATH`` (useful for tuning the pattern below), parses
``(food name, insulin index)`` pairs out of the text, and writes them to
``CSV_PATH`` sorted by insulin index, highest first.
"""

import re
from typing import List, Tuple

import pandas as pd

PDF_PATH = "/home/ok/Desktop/Diabetes/1-s2.0-S2405457724001633-mmc1.pdf"
TEXT_DUMP_PATH = "all_text"
CSV_PATH = "/home/ok/Desktop/Diabetes/insulin_index_sorted.csv"

# One table row: "<number>. <food name> <digits>SINGLE|MIXED MEAL <II value>".
# NOTE(review): this layout is inferred from the original pattern, not from
# the PDF itself -- check it against the text dump if rows go missing.
# The meal-type alternation MUST be grouped: without the (?:...), "|" splits
# the whole expression into two unrelated patterns and no match ever carries
# both the food name and the II value.
ENTRY_PATTERN = re.compile(
    r"\d+\.\s+(.+?)\s+\d+(?:SINGLE|MIXED MEAL)\s+([\d.]+)"
)


def extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page in the PDF at *pdf_path*."""
    # Imported lazily so the pure text-parsing helpers in this module stay
    # importable (and testable) on machines without PyMuPDF installed.
    import fitz  # PyMuPDF

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def parse_insulin_index(text: str) -> List[Tuple[str, float]]:
    """Return ``(food name, insulin index)`` pairs found in *text*.

    Pairs are returned in order of appearance. Matches whose value does not
    parse as a float are skipped.
    """
    entries = []
    for raw_name, raw_value in ENTRY_PATTERN.findall(text):
        try:
            value = float(raw_value)
        except ValueError:
            # "[\d.]+" also accepts strings such as "." or "1.2.3".
            continue
        entries.append((raw_name.strip(), value))
    return entries


def build_sorted_frame(entries: List[Tuple[str, float]]) -> pd.DataFrame:
    """Return *entries* as a DataFrame sorted by insulin index, descending."""
    frame = pd.DataFrame(entries, columns=["Food Name", "Insulin Index"])
    return frame.sort_values(by="Insulin Index", ascending=False)


def main() -> str:
    """Run the PDF -> text dump -> CSV pipeline and return the CSV path."""
    all_text = extract_pdf_text(PDF_PATH)

    # Keep a raw dump next to the script so the pattern can be checked
    # against what the PDF extraction actually produced.
    with open(TEXT_DUMP_PATH, "w", encoding="utf-8") as txt_file:
        txt_file.write(all_text)

    entries = parse_insulin_index(all_text)
    for name, _ in entries:
        print(name)

    df_sorted = build_sorted_frame(entries)
    df_sorted.to_csv(CSV_PATH, index=False)
    print(f"Wrote {len(df_sorted)} rows to {CSV_PATH}")
    return CSV_PATH


if __name__ == "__main__":
    main()