ProductQuery/app/services/file_service.py
2024-12-17 10:47:33 +01:00

49 lines
1.5 KiB
Python

# app/services/file_service.py
import os
import pdfplumber
class FileService:
    """
    A service for file-related operations on a single folder: listing the
    PDF files it contains and extracting their text.
    """

    # Extension used to recognise PDF files; compared case-insensitively.
    _PDF_SUFFIX = ".pdf"

    def __init__(self, folder_path: str):
        """
        Initialize the FileService with the folder path to read files from.

        :param folder_path: Folder to scan; stored as an absolute path in
            ``self.folder_path``.
        """
        self.folder_path = os.path.abspath(folder_path)

    def load_pdfs(self) -> list:
        """
        Collect the paths of all PDF files located directly in the folder.

        Only regular files are returned (a sub-directory whose name happens
        to end in ".pdf" is skipped), the extension check ignores case, and
        the result is sorted by file name so the order is reproducible.

        :return: Sorted list of paths to the PDF files in the folder.
        :raises FileNotFoundError: If the folder does not exist or contains
            no PDF files.
        :raises NotADirectoryError: If the path exists but is not a folder.
        """
        if not os.path.exists(self.folder_path):
            raise FileNotFoundError(f"The folder {self.folder_path} does not exist.")
        if not os.path.isdir(self.folder_path):
            raise NotADirectoryError(f"The path {self.folder_path} is not a folder.")

        pdf_files = []
        # os.listdir returns entries in arbitrary order; sort for determinism.
        for entry in sorted(os.listdir(self.folder_path)):
            if not entry.lower().endswith(self._PDF_SUFFIX):
                continue
            candidate = os.path.join(self.folder_path, entry)
            # Skip directories (or other non-files) that merely look like PDFs.
            if os.path.isfile(candidate):
                pdf_files.append(candidate)

        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in the folder {self.folder_path}.")
        return pdf_files

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract the text of every page of a PDF file using pdfplumber.

        Pages for which pdfplumber yields no text (``None`` or an empty
        string) are skipped; every other page's text is followed by a
        newline.

        :param pdf_path: Path to the PDF file.
        :return: Extracted text as a single string (empty if no page has text).
        """
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            # Join inside the ``with`` block: the generator reads pages lazily
            # and the document must still be open while it is consumed.
            return "".join(f"{page_text}\n" for page_text in page_texts if page_text)