
Building a Visual LLM-Powered PDF Analysis App with Streamlit + LangChain

2025/12/26

Streamlit is a Python framework that turns ordinary scripts into interactive web apps.
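To see what that means, here is a minimal sketch (hello.py is just an example file name): a few calls turn the script into a page, and Streamlit re-runs the script on every widget interaction.

import streamlit as st

st.title("Hello Streamlit")           # rendered as the page title
name = st.text_input("Your name:")    # an interactive text box
if name:
    st.write(f"Hello, {name}!")       # updates whenever the input changes

Run it with streamlit run hello.py and it opens as a local web page.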

Requirements file: https://ws-blog-img.oss-cn-hangzhou.aliyuncs.com/wangsheng/langchain-requirements.txt
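If you download that file (assuming it is saved locally under the same name), the dependencies install in one command:

pip install -r langchain-requirements.txt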

You need an API key for the LLM; here I load it from a .env file with load_dotenv().
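As a sketch, the .env file sits next to the app and holds the key under the variable name the integration expects; for langchain_deepseek that is DEEPSEEK_API_KEY (the sk-... value below is a placeholder):

# .env
DEEPSEEK_API_KEY=sk-...

load_dotenv() copies these entries into environment variables, where ChatDeepSeek picks the key up automatically.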

First install the UI packages (the rest come from the requirements file above):

pip install streamlit streamlit-extras

The complete app, saved as app.py:

import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_deepseek import ChatDeepSeek
from langchain.chains.question_answering import load_qa_chain
import pickle
import os
from dotenv import load_dotenv

load_dotenv()  # load the LLM API key from .env

# 1. Build the sidebar UI
with st.sidebar:
    st.title("Ask for PDF")
    st.markdown("Upload a PDF file, then type a question and search for the answer inside it.")
    add_vertical_space(10)  # insert 10 blank lines
    st.write("Demo project")

# main() is the program entry point
def main():
    st.header("Ask for PDF")

    # File uploader
    pdf = st.file_uploader("Upload a PDF", type="pdf")
    if pdf is not None:
        st.write(f">{pdf.name} uploaded")
        # 2. Read the PDF page by page
        pdf_reader = PdfReader(pdf)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None
        # st.write(text)

        # 3. Split the text into overlapping chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=20, length_function=len
        )
        texts = text_splitter.split_text(text)
        st.write(texts)  # debug: show the chunks

        # 4. Initialize the embedding model (runs locally, on CPU)
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2",
            model_kwargs={'device': 'cpu'}
        )
        st.write(">Embedding model initialized")
        store_name = pdf.name[:-4]  # strip the .pdf extension
        # If the vector store already exists on disk, reuse it
        if os.path.exists(f"{store_name}.pkl"):
            with open(f"{store_name}.pkl", "rb") as f:
                vectorstore = pickle.load(f)
            st.write(">Vector store loaded from disk")
        else:
            vectorstore = FAISS.from_texts(texts, embeddings)
            # Save the vector store
            with open(f"{store_name}.pkl", "wb") as f:
                pickle.dump(vectorstore, f)
            st.write(f">Vector store saved as {store_name}.pkl")

        # 5. Take a question from the user and search the vector store
        query = st.text_input("Enter a question:")
        if query:
            docs = vectorstore.similarity_search(query, k=30)
            llm = ChatDeepSeek(model="deepseek-chat", temperature=0.7)
            # "stuff" packs all retrieved chunks into a single prompt
            chain = load_qa_chain(llm, chain_type="stuff")
            response = chain.invoke({"question": query, "input_documents": docs})
            st.write(f">Answer: {response['output_text']}")

if __name__ == '__main__':
    main()

# Start the app
streamlit run app.py
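One caveat on step 4: pickling the whole FAISS vector store works with some langchain/faiss version combinations but is fragile across upgrades. As an alternative sketch (not what the code above uses), FAISS in langchain_community ships its own save_local/load_local; this is a drop-in replacement for the persistence block, reusing the same store_name, texts, and embeddings variables, with index_dir as a hypothetical directory name:

index_dir = f"{store_name}_faiss"
if os.path.exists(index_dir):
    vectorstore = FAISS.load_local(
        index_dir,
        embeddings,
        allow_dangerous_deserialization=True,  # required by newer langchain-community releases
    )
else:
    vectorstore = FAISS.from_texts(texts, embeddings)
    vectorstore.save_local(index_dir)  # writes index.faiss and index.pkl into index_dir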

