Spaces:
Runtime error
Runtime error
| from operator import itemgetter | |
| from collections import OrderedDict | |
| from typing import Dict, List, Iterator, Union, Tuple | |
| import re | |
class TextExtractor:
    """Helpers that turn the text of a PDF document into tagged, slide-like sections.

    The intended pipeline is:
    ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags`` -> ``get_slides``.

    None of the helpers use instance state, so they are declared as static
    methods and can be called either on the class or on an instance.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, dict]]:
        """
        Count how often every font style occurs in the document.

        Only blocks whose ``type`` is 0 are inspected, and spans whose text is
        empty or whitespace-only are skipped.

        Args:
            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
                Each page must provide ``get_text('dict')``.
            granularity (bool, optional): When True the style identifier is built from
                'size', 'flags' and 'font' instead of 'size' alone, and the stored style
                additionally records 'flags' and 'color'. Defaults to False.

        Raises:
            ValueError: Raised if no font is detected.

        Returns:
            Tuple[List[Tuple[str, int]], Dict[str, dict]]:
                font_counts: (identifier, count) pairs, most frequent first, e.g.
                    [('12.0', 266), ('16.020000457763672', 18), ('13.979999542236328', 7), ('7.019999980926514', 2)]
                styles: style description for every identifier, e.g.
                    {'12.0': {'size': 12.0, 'font': 'Arial'}}
        """
        styles = {}
        font_counts = {}
        for page in doc:
            for block in page.get_text('dict')['blocks']:
                if block['type'] != 0:
                    continue
                for line in block['lines']:
                    for span in line['spans']:
                        if not span['text'].strip():
                            continue
                        if granularity:
                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
                            styles[identifier] = {
                                'size': span['size'],
                                'flags': span['flags'],
                                'font': span['font'],
                                'color': span['color'],
                            }
                        else:
                            identifier = "{0}".format(span['size'])
                            styles[identifier] = {'size': span['size'], 'font': span['font']}
                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

        if not font_counts:
            raise ValueError("Zero discriminating fonts found!")

        # sorted() is stable, so identifiers with equal counts keep first-seen order.
        ranked_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)
        return ranked_counts, styles

    @staticmethod
    def get_font_tags(font_counts, styles) -> Dict[float, str]:
        """
        Return a dictionary of font sizes and their corresponding tags.

        The size of the most frequent style becomes the paragraph size ('<p>').
        Bigger sizes become headers ('<hN>') and smaller sizes become '<sN>',
        where N is the 1-based rank of the size among all distinct sizes
        (largest first).

        Args:
            font_counts (List[Tuple[str, int]]): (identifier, count) pairs, most frequent first.
            styles (Dict[str, Dict]): Style description of every identifier; must at
                least contain the most frequent identifier with a 'size' entry.

        Returns:
            Dict[float, str]: Dictionary of the font sizes as keys and their tags as values.
            Example: {12.0: '<p>', 16.020000457763672: '<h1>', 13.979999542236328: '<h2>', 7.019999980926514: '<s4>'}
        """
        p_size = styles[font_counts[0][0]]['size']

        # Take the size from the style record when there is one: granular
        # identifiers such as "12.0_0_Arial" cannot be parsed with float().
        distinct_sizes = set()
        for identifier, _ in font_counts:
            if identifier in styles:
                distinct_sizes.add(styles[identifier]['size'])
            else:
                distinct_sizes.add(float(identifier))

        size_tag = {p_size: "<p>"}
        # sorting the font sizes high to low, so that we can append the right integer to each tag
        for rank, size in enumerate(sorted(distinct_sizes, reverse=True), start=1):
            if size > p_size:
                size_tag[size] = f"<h{rank}>"
            elif size < p_size:
                size_tag[size] = f"<s{rank}>"
        return size_tag

    @staticmethod
    def assign_tags(doc, size_tag) -> List[str]:
        """
        Scrape headers & paragraphs from the PDF and return texts with element tags.

        Spans of the same size are concatenated, a "|" is appended after every
        line that contributed text, and a change of font size starts a new
        entry. One entry is appended per type-0 block, so the result can contain
        empty strings (a block without usable text, or a size change on the
        first span of a block).

        Args:
            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
            size_tag (dict): Textual element tags for each size.

        Returns:
            list: Texts with prepended element tags.
            Example: ['<h1>Group Members: |', '<p>1. Stella Shania Mintara - 2301860596 | 2. David Samuel - 2301850304 ||']
        """
        texts = []
        previous_s = {}

        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                if block['type'] != 0:
                    continue
                block_string = ""
                for line in block["lines"]:
                    for span in line["spans"]:
                        # Ignore spans that hold nothing but punctuation/whitespace.
                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
                            continue

                        if not previous_s:  # First span of the document
                            block_string = size_tag[span['size']] + span['text']
                        elif span['size'] == previous_s['size']:
                            if not block_string.strip("|"):  # New block (empty or only pipes)
                                block_string = size_tag[span['size']] + span['text']
                            else:  # in the same block, so concatenate strings
                                block_string += f" {span['text']}"
                        else:  # size changed: flush what was collected so far
                            texts.append(block_string)
                            block_string = size_tag[span['size']] + span['text']
                        previous_s = span

                    # Mark the end of a line with a pipe.
                    if block_string:
                        block_string += "|"

                texts.append(block_string)
        return texts

    @staticmethod
    def get_slides(texts):
        """
        Arrange the tagged texts into a slide format dictionary where the page is the
        key and the value is a list containing the components of that page.

        A new page starts at every '<h1>' text. Other headers are stored as
        ``(tag, text)`` tuples and paragraphs as ``[tag, text]`` lists. A '<p>'
        text is split into separate paragraphs wherever two or more pipes occur;
        a paragraph that starts with a lowercase character is glued to the
        preceding paragraph. Texts without a tag, texts with any other tag and
        everything that precedes the first '<h1>' are not part of the result.

        Args:
            texts (List[str]): PDF text with element tags.

        Returns:
            Dict: The text of the PDF separated by the header 1 tags.
            Example: {'Page 1': [('h1', 'Group Members:'),
                                 ['p', '1. Stella Shania Mintara - 2301860596 2. David Samuel - 2301850304']],
                      'Page 2': [('h1', 'Case Problem'),
                                 ['p', 'FreshMart is an established large-scale supermarket']]}
        """
        slides = {}
        section = []
        page = 1

        for text in texts:
            tag_match = re.search(r'(?<=<)(.*?)(?=>)', text)
            if not tag_match:
                continue
            tag = tag_match.group()

            if tag == 'h1':
                # A type 1 header opens a new page; later items are appended to
                # this same list object, so they show up in `slides` as well.
                section = [('h1', re.sub(r'<.*?>|\|', '', text).strip())]
                slides[f"Page {page}"] = section
                page += 1
            elif tag.startswith('h'):  # non h1 headers
                # Remove tag and pipes from the text
                section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
            elif tag.startswith('p'):
                # Two or more consecutive pipes separate paragraphs.
                for chunk in re.split(r"\|{2,}", text):
                    paragraph = re.sub(r'<.*?>|\|', '', chunk).strip()  # Remove tag and pipes
                    paragraph = re.sub(' +', ' ', paragraph)  # Collapse runs of spaces
                    if not paragraph:
                        continue
                    # Only a previous paragraph (a list) can be extended; headers
                    # are immutable tuples and the section may still be empty.
                    extends_previous = (
                        paragraph[0].islower()
                        and bool(section)
                        and isinstance(section[-1], list)
                    )
                    if extends_previous:
                        section[-1][1] += f" {paragraph}"
                    else:
                        section.append([tag, paragraph])
        return slides