49 lines
1.5 KiB
Python
49 lines
1.5 KiB
Python
# app/services/file_service.py
|
|
|
|
import os
|
|
import pdfplumber
|
|
|
|
class FileService:
    """
    A service to handle file-related operations, including loading PDFs from a folder.
    """

    def __init__(self, folder_path: str):
        """
        Initialize the FileService with the folder path to read files from.

        :param folder_path: Folder to scan. It is stored as an absolute path,
            resolved against the working directory at construction time.
        """
        self.folder_path = os.path.abspath(folder_path)

    def load_pdfs(self) -> list:
        """
        Reads all PDF files from the folder and returns their paths.

        Only regular files are returned (a sub-directory whose name happens to
        end in ".pdf" is skipped), the extension is matched case-insensitively,
        and the result is sorted by file name so the order is deterministic.

        :return: List of paths to PDF files in the folder.
        :raises FileNotFoundError: If the folder does not exist or contains no
            PDF files.
        """
        if not os.path.exists(self.folder_path):
            raise FileNotFoundError(f"The folder {self.folder_path} does not exist.")

        # os.listdir returns entries in arbitrary order; sort for stable output.
        candidates = (
            os.path.join(self.folder_path, name)
            for name in sorted(os.listdir(self.folder_path))
            if name.lower().endswith(".pdf")
        )
        pdf_files = [path for path in candidates if os.path.isfile(path)]

        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in the folder {self.folder_path}.")

        return pdf_files

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extracts text from the PDF file using pdfplumber.

        Pages for which ``extract_text()`` returns nothing are skipped; the text
        of every other page is followed by a newline.

        :param pdf_path: Path to the PDF file.
        :return: Extracted text as a string (empty if no page yielded text).
        """
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Join once instead of growing a string with += inside the loop.
            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)