Spaces:
Build error
Build error
File size: 5,312 Bytes
"""TEXT SUMMARIZATION Web APP"""
# Importing Packages
import base64
import mimetypes
from pathlib import Path

import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
# Load the tokenizer and model
checkpoint = 'Lamini-1'


# Streamlit re-executes this script on every widget interaction; caching the
# loaded objects as a resource avoids reloading the weights on each rerun.
# show_spinner=False keeps the loader from emitting any UI element of its own.
@st.cache_resource(show_spinner=False)
def _load_tokenizer_and_model(model_checkpoint):
    """Load the T5 tokenizer and model stored under ``model_checkpoint``.

    Args:
        model_checkpoint: Name or path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        model_checkpoint, device_map="auto", torch_dtype=torch.float32
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_tokenizer_and_model(checkpoint)
# File Loader & Processing
def file_processing(file):
    """Return the text content of a PDF as a single string.

    The PDF is loaded page by page, split into chunks of at most 200
    characters (with a 50-character overlap between chunks), and the chunk
    texts are concatenated in order.

    Args:
        file: Path of the PDF, passed to ``PyPDFLoader``.

    Returns:
        The concatenated ``page_content`` of every chunk; an empty string
        when the splitter yields no chunks.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)
    # NOTE(review): chunks are joined with no separator and the splitter is
    # configured with an overlap, so overlapping text presumably appears twice
    # in the result -- confirm whether that is intended.
    return "".join(chunk.page_content for chunk in texts)
# Language Model Pipeline -> Summarization
def llm_pipeline(filepath, summary_length):
    """Summarize the PDF at ``filepath`` with the module-level model.

    Args:
        filepath: Path of the PDF to summarize; forwarded to
            ``file_processing``.
        summary_length: Value passed to the pipeline as ``max_length``.

    Returns:
        The ``summary_text`` of the first pipeline result.
    """
    pipe_summ = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=summary_length,
        # Keep the lower bound feasible when fewer than 50 tokens are requested.
        min_length=min(50, summary_length),
    )
    document_text = file_processing(filepath)
    result = pipe_summ(document_text)
    return result[0]["summary_text"]
# Streamlit Code: configure the page to use the wide layout.
st.set_page_config(layout="wide")
# Display Background
def add_bg_from_local(image_file):
    """Set a local image as the app background via an inline CSS data URI.

    The image is read, base64-encoded, and embedded in a ``<style>`` rule for
    ``.stApp`` that is injected with ``st.markdown``.

    Args:
        image_file: Path of the image file to embed.

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    # Declare the media type that matches the file (e.g. image/jpeg for .jpg)
    # instead of always claiming PNG; fall back to PNG when it is unknown.
    mime_type, _ = mimetypes.guess_type(image_file)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Use a distinct name for the handle so the path parameter is not rebound.
    with open(image_file, "rb") as image_handle:
        encoded_string = base64.b64encode(image_handle.read()).decode()

    st.markdown(
        f"""
<style>
.stApp {{
background-image: url(data:{mime_type};base64,{encoded_string});
background-size: cover;
opacity:0.9;
}}
</style>
""",
        unsafe_allow_html=True,
    )
add_bg_from_local("Images/background2.jpg")
# Font Style: inline the contents of font.css into the page.
with open("font.css") as css_file:
    font_css = css_file.read()
st.markdown(f"<style>{font_css}</style>", unsafe_allow_html=True)
# Sidebar: picture, app description, and the summary-length slider.
sidebar = st.sidebar
sidebar.image("Images/sidebar_pic2.png")
sidebar.title("ABOUT THE APP")
about_text = (
    "SummaScribe: Your PDF wingman! 🚀 Unleash the power of Streamlit and LangChain to transform boring text PDFs into "
    "snappy summaries. Lightning-fast processing,ninja-level NLP algorithms, and a touch of magic—making info "
    "extraction a breeze!"
)
sidebar.write(about_text)
# The chosen value is later handed to llm_pipeline as the summary length.
selected_summary_length = sidebar.slider(
    "SELECT SUMMARY STRENGTH",
    min_value=50,
    max_value=1000,
    value=500,
)
# Display pdf of a given file
def display(file):
    """Render the PDF at ``file`` inline in the current Streamlit container.

    The file is read, base64-encoded, and embedded in an ``<iframe>`` through
    a ``data:application/pdf`` URI.

    Deliberately not wrapped in ``st.cache_data``: that cache is keyed on the
    path string only, so a different upload saved under the same filename
    would replay the previously rendered PDF.

    Args:
        file: Path of the PDF file to show.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Opening file from filepath
    with open(file, "rb") as pdf_handle:
        base64_pdf = base64.b64encode(pdf_handle.read()).decode("utf-8")
    # Embedding pdf in html
    display_pdf = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="500" '
        f'type="application/pdf"></iframe>'
    )
    # Displaying File
    st.markdown(display_pdf, unsafe_allow_html=True)
# Main content: stylesheet for the title, including its hover effects.
title_css = """
<style>
.summascribe-title {
font-size: 57px;
text-align: center;
transition: transform 0.2s ease-in-out;
}
.summascribe-title span {
transition: color 0.2s ease-in-out;
}
.summascribe-title:hover span {
color: #f5fefd; /* Hover color */
}
.summascribe-title:hover {
transform: scale(1.15);
}
</style>
"""
st.markdown(title_css, unsafe_allow_html=True)

text = "SummaScribe"  # Text to be styled
# Wrap every letter in its own span; lightness starts at 70% and decreases
# slightly with each letter.
colored_text = "".join(
    f'<span style="color: hsl(220, 20%, {70 - (i * 10 / len(text))}%);">{char}</span>'
    for i, char in enumerate(text)
)
colored_text_with_malt = f'{colored_text} <span style="color: hsl(220, 20%, 70%);">✧</span>'

title_html = '<h1 class="summascribe-title">' + colored_text_with_malt + '</h1>'
st.markdown(title_html, unsafe_allow_html=True)

subtitle_html = (
    '<h2 style="font-size:30px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>'
)
st.markdown(subtitle_html, unsafe_allow_html=True)
# Your Streamlit app content here...
def main():
    """Run the upload -> preview -> summarize flow of the app.

    Shows a PDF uploader and a usage note. Once a file is uploaded and the
    "Summarize" button is pressed, the file is saved under ``data/``, shown
    in the left column, and its summary is shown in the right column.
    """
    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images. This limitation is due "
            "to our app's current focus on leveraging advanced natural language processing (NLP) algorithms to "
            "extract key information from textual content."
        )

    # The button is only rendered once a file has been uploaded.
    if uploaded_file is None or not st.button("Summarize"):
        return

    # The upload name is user-controlled: keep only its final path component
    # so the file cannot be written outside the upload directory, and make
    # sure that directory exists before writing.
    upload_dir = Path("data")
    upload_dir.mkdir(parents=True, exist_ok=True)
    filepath = str(upload_dir / Path(uploaded_file.name).name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole buffer regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    col1, col2 = st.columns((1, 1))
    with col1:
        st.info("Uploaded File")
        display(filepath)
    with col2:
        st.info("Summary")
        # st.spinner only shows while its ``with`` block is executing.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")


if __name__ == "__main__":
    main()
|