File size: 5,312 Bytes
8c3af37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f70a1e8
8c3af37
 
 
 
 
 
d3c2418
8c3af37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bf13a9
8c3af37
9bf13a9
8c3af37
 
9bf13a9
8c3af37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""TEXT SUMMARIZATION Web APP"""

# Importing Packages
import base64
import streamlit as st
import torch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline

# Load the tokenizer and model
checkpoint = 'Lamini-1'


@st.cache_resource(show_spinner=False)
def _load_model(model_checkpoint):
    """Load the T5 tokenizer and model for ``model_checkpoint``, cached by Streamlit.

    Streamlit re-executes this script from the top on each interaction, so an
    uncached module-level ``from_pretrained`` call would run again on every
    rerun. ``show_spinner=False`` keeps this call from emitting a UI element,
    since it runs before ``st.set_page_config``.

    Args:
        model_checkpoint: Checkpoint name or path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        model_checkpoint, device_map="auto", torch_dtype=torch.float32
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_model(checkpoint)


# File Loader & Processing
def file_processing(file):
    """Extract the text of a PDF as a single string.

    The PDF is loaded with ``PyPDFLoader``, split into chunks of at most 200
    characters, and the chunks' ``page_content`` is joined back together in
    order.

    Args:
        file: Path of the PDF, passed to ``PyPDFLoader``.

    Returns:
        The joined chunk text; an empty string when no chunk is produced.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    # The chunks are re-joined into one string below, so any overlap between
    # neighbouring chunks would show up as duplicated text in the result.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=0)
    texts = text_splitter.split_documents(pages)
    # Join with a space so the last word of one chunk and the first word of
    # the next do not fuse together.
    return " ".join(text.page_content for text in texts)


# Language Model Pipeline -> Summarization
def llm_pipeline(filepath, summary_length):
    """Summarize the PDF at ``filepath`` with the module-level T5 model.

    Args:
        filepath: Path of the PDF; its text is extracted by ``file_processing``.
        summary_length: Passed to the summarization pipeline as ``max_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or an empty string
        when no text could be extracted from the PDF.
    """
    document_text = file_processing(filepath)
    if not document_text.strip():
        # Nothing to summarize; skip running the model on empty input.
        return ""

    pipe_summ = pipeline(
        "summarization",
        model=base_model,
        tokenizer=tokenizer,
        max_length=summary_length,
        # Keep min_length from exceeding max_length for short summaries.
        min_length=min(50, summary_length),
    )
    result = pipe_summ(document_text)
    return result[0]["summary_text"]


# Streamlit Code
# Use the wide page layout.
# NOTE(review): Streamlit's docs ask for set_page_config to come before other
# Streamlit commands — keep this above the st.* calls further down.
st.set_page_config(layout="wide")


# Display Background
def add_bg_from_local(image_file):
    """Set a local image as the app background via an inline base64 data URI.

    The MIME type is guessed from the file name, so a ``.jpg`` is declared as
    ``image/jpeg`` rather than always ``image/png``. If the name does not map
    to an image type, ``image/png`` is used.

    Args:
        image_file: Path of the image file to embed.
    """
    import mimetypes

    mime_type, _ = mimetypes.guess_type(image_file)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/png"

    # Use a separate name for the handle so the path parameter is not rebound.
    with open(image_file, "rb") as image_handle:
        encoded_image = base64.b64encode(image_handle.read()).decode()

    st.markdown(
        f"""
    <style>
    .stApp {{
        background-image: url(data:{mime_type};base64,{encoded_image});
        background-size: cover;
        opacity:0.9;
    }}
    </style>
    """,
        unsafe_allow_html=True,
    )


# Apply the page background image
add_bg_from_local("Images/background2.jpg")

# Font Style: inject the contents of font.css as an inline stylesheet
with open("font.css") as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

# Sidebar: picture, title, description and the summary-length slider
st.sidebar.image("Images/sidebar_pic2.png")
st.sidebar.title("ABOUT THE APP")
st.sidebar.write(
    "SummaScribe: Your PDF wingman! 🚀 Unleash the power of Streamlit and LangChain to transform boring text PDFs into "
    "snappy summaries. Lightning-fast processing,ninja-level NLP algorithms, and a touch of magic—making info "
    "extraction a breeze!"
)
# The slider value is later passed to llm_pipeline as the summary length
selected_summary_length = st.sidebar.slider(
    "SELECT SUMMARY STRENGTH",
    min_value=50,
    max_value=1000,
    value=500,
)


# Display pdf of a given file
def display(file):
    """Render the PDF at ``file`` inline as a base64 ``<iframe>``.

    Deliberately not wrapped in ``st.cache_data``: the cache key would be the
    path string alone, so a different upload saved under the same file name
    would replay the previously rendered document instead of the new one.

    Args:
        file: Path of the PDF to embed.
    """
    # Read the file and encode it for use in a data URI
    with open(file, "rb") as pdf_file:
        base64_pdf = base64.b64encode(pdf_file.read()).decode("utf-8")
    # Embedding pdf in html
    display_pdf = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="500" '
        f'type="application/pdf"></iframe>'
    )
    # Displaying File
    st.markdown(display_pdf, unsafe_allow_html=True)


# Main content
# Main content: hover styles for the page title
st.markdown(
    """
    <style>
    .summascribe-title {
        font-size: 57px;
        text-align: center;
        transition: transform 0.2s ease-in-out;
    }
    .summascribe-title span {
        transition: color 0.2s ease-in-out;
    }
    .summascribe-title:hover span {
        color: #f5fefd; /* Hover color */
    }
    .summascribe-title:hover {
        transform: scale(1.15);
    }
    </style>
    """,
    unsafe_allow_html=True,
)

# Title: each letter gets its own <span>; lightness starts at 70% and
# decreases by 10/len(text) per letter.
text = "SummaScribe"  # Text to be styled
colored_text = "".join(
    f'<span style="color: hsl(220, 20%, {70 - (i * 10 / len(text))}%);">{char}</span>'
    for i, char in enumerate(text)
)
# Append the trailing star glyph in the base colour
colored_text_with_malt = f'{colored_text} <span style="color: hsl(220, 20%, 70%);">&#x2727;</span>'
st.markdown(f'<h1 class="summascribe-title">{colored_text_with_malt}</h1>', unsafe_allow_html=True)

# Subtitle
st.markdown(
    '<h2 style="font-size:30px;color: #F5FEFD; text-align: center;">Text Document Summarization using LLMs</h2>',
    unsafe_allow_html=True,
)


# Page body: PDF upload, preview and summary
def main():
    """Render the upload widget and, on request, the PDF preview and its summary.

    The uploaded PDF is written under ``data/``, shown in the left column and
    summarized in the right column using the sidebar's selected length.
    """
    import os

    uploaded_file = st.file_uploader("Upload PDF file", type=["pdf"])
    with st.expander("NOTE"):
        st.write(
            "Summascribe currently accepts PDF documents that contain only text and no images. This limitation is due "
            "to our app's current focus on leveraging advanced natural language processing (NLP) algorithms to "
            "extract key information from textual content."
        )
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns((1, 1))

    # The upload name comes from the client: keep only its last path component
    # so it cannot point outside data/, and make sure the directory exists.
    os.makedirs("data", exist_ok=True)
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        display(filepath)
    with col2:
        st.info("Summary")
        # st.spinner only shows while used as a context manager around the work.
        with st.spinner(text="In progress..."):
            summary = llm_pipeline(filepath, selected_summary_length)
        st.success(summary, icon="✅")


# Build the page body when this file is executed as a script
if __name__ == "__main__":
    main()