-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdocument_loader.py
45 lines (37 loc) · 1.31 KB
/
document_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from langchain_community.document_loaders import (
DirectoryLoader,
PyPDFLoader
)
import os
from typing import List
from langchain_core.documents import Document
def load_documents(path: str) -> List[Document]:
    """
    Load the PDF documents found under a directory.

    Only PDF files are handled at present: the loader table below contains a
    single ``DirectoryLoader`` configured with the ``**/*.pdf`` glob and
    ``PyPDFLoader``. Other file types (e.g. Markdown or HTML) are NOT loaded;
    supporting one means registering an additional entry in that table.

    Args:
        path (str): The path to the directory containing documents to load.

    Returns:
        List[Document]: The documents returned by each configured loader,
            concatenated in the order the loaders are registered.

    Raises:
        FileNotFoundError: If the specified path does not exist.
    """
    # Fail fast with a clear message before any loader is constructed.
    # NOTE: this only checks existence; a path that exists but is not a
    # directory passes this check and is left to DirectoryLoader to handle.
    if not os.path.exists(path):
        raise FileNotFoundError(f"The specified path does not exist: {path}")

    # Maps a file extension (used only for the progress message) to the
    # loader responsible for files of that type.
    loaders = {
        ".pdf": DirectoryLoader(
            path,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader,
            show_progress=True,
            use_multithreading=True,
        ),
    }

    docs: List[Document] = []
    for file_type, loader in loaders.items():
        print(f"Loading {file_type} files")
        docs.extend(loader.load())
    return docs