| from langchain.prompts import PromptTemplate |
| from langchain_community.chat_models import ChatOpenAI |
| from langchain.chains import LLMChain |
| from PIL import Image |
| import os |
| from utils import load_json_file, str2time |
| from openai import OpenAI |
| import base64 |
|
|
def get_smallest_timestamp(timestamps):
    """Return the earliest entry of ``timestamps``.

    Entries are compared by their ``str2time`` conversion, but the value
    handed back is the original, unconverted entry. If several entries tie
    for the minimum, the one that appears first in the list is returned.

    An assertion fails when ``timestamps`` is empty.
    """
    assert len(timestamps) > 0

    # min() keeps the first of several equal keys, which matches a
    # left-to-right scan that only replaces on a strict "<".
    return min(timestamps, key=str2time)
|
|
def generate(query, context, relevant_timestamps=None):
    """Ask gpt-4o-mini to answer ``query`` using ``context``.

    Args:
        query: The user's question about the video.
        context: Text describing the segments considered relevant.
        relevant_timestamps: Optional timestamps of those segments. When
            given and non-empty, the earliest one (as chosen by
            ``get_smallest_timestamp``) is appended to the answer after a
            single space.

    Returns:
        The model's answer, optionally followed by the earliest timestamp.
    """
    answer_prompt = PromptTemplate(
        input_variables=["question", "context"],
        template=(
            "You're a helpful LLM assistant in answering questions regarding a video. "
            "Given contexts are segments relevant to the question, please answer the question. "
            "Do not refer to segments. Context: {context}, question: {question} \nA:"
        ),
    )
    chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    response = LLMChain(llm=chat_model, prompt=answer_prompt).run(
        question=query, context=context
    )

    # Nothing to append when no timestamps were supplied.
    if relevant_timestamps is None or len(relevant_timestamps) == 0:
        return response

    earliest = get_smallest_timestamp(relevant_timestamps)
    return response + f' {earliest}'
|
|
|
|
def check_relevance(query, relevant_metadatas):
    """Keep only the retrieved segments that an LLM grades as relevant.

    Every segment is rendered as one ``Segment {i}: transcript=... caption=...``
    line, the whole list is sent to gpt-4o-mini together with ``query``, and
    the model is asked for one comma-separated ``yes``/``no`` grade per
    segment.

    Args:
        query: The user's question.
        relevant_metadatas: Sequence of segment dicts; each must provide the
            keys ``'transcript'`` and ``'caption'``, and ``'start_time'`` for
            any segment that ends up graded relevant.

    Returns:
        A ``(context, timestamps)`` tuple: the context lines of the segments
        graded ``yes`` (original segment numbering preserved) and the
        ``'start_time'`` of each of those segments, in the same order.
    """
    context = "".join(
        f"Segment {i}: transcript={frame['transcript']} caption={frame['caption']}\n"
        for i, frame in enumerate(relevant_metadatas)
    )

    prompt = PromptTemplate(input_variables=["question", "context"], template="""
You are a grader assessing relevance of a retrieved video segment to a user question. \n
If the video segment contains keyword(s) or semantic meaning related to the question, grade it as relevant. \n
Give a binary score 'yes' or 'no' score to indicate whether the video segment is relevant to the question. \n
Answer in a string, separated by commas. For example: if there are segments provided, answer: yes,no,no,yes. \n
Question: {question} Context: {context}\n A:""")

    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(question=query, context=context)

    grades = response.split(',')

    relevant_lines = []
    relevant_timestamps = []
    # zip() stops at the shorter input, so a reply with more grades than
    # segments can no longer index past the end of the segment list.
    for i, (frame, grade) in enumerate(zip(relevant_metadatas, grades)):
        # The grade is free-form model text: tolerate case differences and
        # stray quotes or a trailing full stop ("Yes", "yes.", "'yes'").
        if grade.strip().strip("'\".").lower() != 'yes':
            continue
        relevant_lines.append(
            f"Segment {i}: transcript={frame['transcript']} caption={frame['caption']}\n"
        )
        relevant_timestamps.append(frame['start_time'])

    return "".join(relevant_lines), relevant_timestamps
|
|
|
|
def retrieve_segments_from_timestamp(metadatas, timestamps, buffer=5000):
    """Collect the segments lying within ``buffer`` of each timestamp.

    A segment matches a timestamp ``t`` when its
    ``[start_time, end_time]`` interval overlaps ``[t - buffer, t + buffer]``,
    with all three values converted by ``str2time``.

    Args:
        metadatas: Sequence of segment dicts with ``'start_time'`` and
            ``'end_time'`` entries.
        timestamps: Timestamp strings to look up.
        buffer: Tolerance added on both sides of each timestamp, in whatever
            unit ``str2time`` returns (the surrounding code names such values
            ``*_ms``, so presumably milliseconds). Defaults to 5000.

    Returns:
        The matching segment dicts, grouped by timestamp in the order the
        timestamps were given. A segment that matches several timestamps
        appears once per matching timestamp.
    """
    if not timestamps:
        return []

    # Convert each segment's bounds once instead of once per timestamp.
    bounds = [
        (str2time(segment['start_time']), str2time(segment['end_time']), segment)
        for segment in metadatas
    ]

    relevant_segments = []
    for timestamp in timestamps:
        target = str2time(timestamp)
        window_start = target - buffer
        window_end = target + buffer
        relevant_segments.extend(
            segment
            for start, end, segment in bounds
            if start <= window_end and end >= window_start
        )
    return relevant_segments
|
|
|
|
def _looks_like_timestamp(text):
    """Return True when ``text`` has the form ``<digits>:<digits>``."""
    minutes, colon, seconds = text.partition(":")
    return bool(colon) and minutes.isdigit() and seconds.isdigit()


def check_timestamps(query):
    """Split a user query into its question and any timestamps it mentions.

    gpt-4o-mini is asked to rewrite the query as
    ``question,timestamp1,timestamp2`` (or just ``question``). Only the
    trailing comma-separated fields that look like ``<digits>:<digits>`` are
    treated as timestamps; everything before them is the question, so a
    question that itself contains commas is kept in one piece.

    Args:
        query: The raw user query.

    Returns:
        A ``(question, timestamps)`` tuple. When no timestamp is detected the
        original ``query`` is returned unchanged with an empty list.
        Otherwise each detected value ``v`` is returned as ``"00:v.00"``.
    """
    prompt = PromptTemplate(input_variables=["question"], template="You're a helpful LLM assistant. You're good at detecting any timestamps provided in a query. Please detect the question and timestamp in the the following question and separated them by commas such as question,timestamp1,timestamp2 if timestamps are provided else just question. Question: {question} \nA:")

    llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(question=query)

    fields = response.split(',')

    # Walk back from the end while the fields still look like timestamps.
    # Field 0 is always left for the question, even if it looks like one.
    first_stamp = len(fields)
    while first_stamp > 1 and _looks_like_timestamp(fields[first_stamp - 1].strip()):
        first_stamp -= 1

    detected = [field.strip() for field in fields[first_stamp:]]
    if not detected:
        return query, []

    # Re-join with the commas that belonged to the question itself.
    question = ",".join(fields[:first_stamp]).strip()
    timestamps = [f"00:{stamp}.00" for stamp in detected]
    return question, timestamps
|
|
def retrieve_by_embedding(index, video_path, query, text_model):
    """Return the metadata of the five segments most similar to ``query``.

    Args:
        index: Vector index exposing ``query(vector=, top_k=, filter=)`` and
            ``fetch(ids=)``; the fetch result must have a ``vectors`` mapping
            from id to an object with a ``metadata`` attribute.
        video_path: Only vectors whose ``video_path`` metadata equals this
            value are searched.
        query: Text to embed and search for. It is also printed to stdout.
        text_model: Object whose ``encode(query)`` result has ``tolist()``.

    Returns:
        The metadata of each match, in the order the matches were returned
        by the index query. Matches whose id is absent from the fetch result
        are skipped.
    """
    print(query)
    query_embedding = text_model.encode(query)

    res = index.query(
        vector=query_embedding.tolist(),
        top_k=5,
        filter={"video_path": {"$eq": video_path}},
    )

    match_ids = [match_['id'] for match_ in res['matches']]
    if not match_ids:
        # Avoid issuing a fetch with an empty id list.
        return []

    # One batched fetch instead of one round trip per match.
    fetched = index.fetch(ids=match_ids).vectors

    # Iterate match_ids rather than the fetched mapping so the order of the
    # query results is preserved; ids missing from the mapping are skipped
    # instead of failing on a placeholder that has no ``metadata``.
    return [fetched[match_id].metadata for match_id in match_ids if match_id in fetched]
|
|
def self_reflection(query, answer, summary):
    """Ask gpt-4o-mini whether ``answer`` satisfactorily answers ``query``.

    The model is shown the video ``summary`` together with the
    question/answer pair and is asked to reply with yes or no.

    Returns:
        The model's raw reply text, unmodified.
    """
    grading_prompt = PromptTemplate(
        input_variables=["summary", "question", "answer"],
        template=(
            "You're a helpful LLM assistant. "
            "You're good at determining if the provided answer is satisfactory to a question relating to a video. "
            "You have access to the video summary as follows: {summary}. "
            "Given a pair of question and answer, give the answer's satisfactory score in either yes or no. "
            "Question: {question}, Answer: {answer} \nA:"
        ),
    )
    chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    grader = LLMChain(llm=chat_model, prompt=grading_prompt)
    return grader.run(summary=summary, question=query, answer=answer)
|
|
|
|
def get_full_transcript(metadatas):
    """Concatenate the transcripts of all segments into one string.

    Whitespace inside each transcript is normalised (leading and trailing
    whitespace removed, internal runs collapsed to a single space), and
    segments whose transcript is blank are dropped so they do not leave
    doubled spaces in the result.

    Args:
        metadatas: Sequence of segment dicts, each with a ``'transcript'``
            string.

    Returns:
        The cleaned transcripts joined by single spaces, or ``""`` when
        there is nothing to join.
    """
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining with " " collapses repeated spaces.
    cleaned = (" ".join(frame['transcript'].split()) for frame in metadatas)

    # A single join avoids growing the result with "+=" inside a loop.
    return " ".join(text for text in cleaned if text)
|
|
def summarize_video(metadatas_path:str):
    """Produce an LLM-written summary of a video from its transcript.

    The segment metadata is read with ``load_json_file(metadatas_path)``,
    flattened into one transcript by ``get_full_transcript``, and passed to
    gpt-4o-mini with a summarisation prompt.

    Returns:
        The model's summary text.
    """
    full_transcript = get_full_transcript(load_json_file(metadatas_path))

    summary_prompt = PromptTemplate(
        input_variables=["transcript"],
        template=(
            "You're a helpful LLM assistant. "
            "Please provide a summary for the video given its full transcript: {transcript} \nA:"
        ),
    )
    chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    summarizer = LLMChain(llm=chat_model, prompt=summary_prompt)
    return summarizer.run(transcript=full_transcript)
|
|
def answer_wrt_timestamp(query, context):
    """Answer a timestamp-specific question from already-retrieved context.

    Args:
        query: The user's question (which refers to a point in the video).
        context: Text describing the segments found around that point.

    Returns:
        gpt-4o-mini's answer text.
    """
    timestamp_prompt = PromptTemplate(input_variables=["question", "context"], template="""
You're a helpful LLM assistant. Given a question and a timestamp, I have retrieved the relevant context as follows. Please answer the question using the information provided in the context. Question: {question}, context: {context} \n
For example: Question="What happens at 4:20?" Caption="a person is standing up" Transcript="I have to go" Appropriate Answer="At 4:20, a person is standing up and saying he has to go."
A:""")
    chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
    return LLMChain(llm=chat_model, prompt=timestamp_prompt).run(
        question=query, context=context
    )
|
|
|
|
def answer_question(index, model_stack, metadatas_path, video_summary:str, video_path:str, query:str, image_input_path:str=None):
    """Answer a question about a video, choosing the retrieval strategy.

    Three paths are possible:

    * ``image_input_path`` is given: delegate to ``answer_image_question``.
    * ``check_timestamps`` finds timestamps in the query: gather the segments
      around them and answer with ``answer_wrt_timestamp``.
    * otherwise: retrieve segments by text embedding, keep those graded
      relevant, generate an answer, and have ``self_reflection`` grade it.
      If the grade is negative, the answer is regenerated with the video
      summary appended to the context (and without a trailing timestamp).

    Args:
        index: Vector index passed through to the retrieval helpers.
        model_stack: Sequence of models; element 0 is used as the text
            embedding model for the embedding path.
        metadatas_path: Path given to ``load_json_file`` to load the
            per-segment metadata.
        video_summary: Summary text used for self-reflection and the retry.
        video_path: Identifier used to filter the vector index.
        query: The user's question.
        image_input_path: Optional path of an image accompanying the query.

    Returns:
        The answer text.
    """
    metadatas = load_json_file(metadatas_path)
    if image_input_path is not None:
        return answer_image_question(index, model_stack, metadatas, video_summary, video_path, query, image_input_path)

    query, timestamps = check_timestamps(query)

    if timestamps:
        segments = retrieve_segments_from_timestamp(metadatas, timestamps)
        context = "".join(
            f"Segment {i}: transcript={frame['transcript']} caption={frame['caption']}\n"
            for i, frame in enumerate(segments)
        )
        return answer_wrt_timestamp(query, context)

    segments = retrieve_by_embedding(index, video_path, query, model_stack[0])
    actual_relevant_context, relevant_timestamps = check_relevance(query, segments)
    answer = generate(query, actual_relevant_context, relevant_timestamps)

    reflect = self_reflection(query, answer, video_summary)

    # The grade is free-form model text ("No", "no.", "No, because ..."), so
    # normalise it instead of requiring the exact string 'no'; an exact match
    # would almost never trigger the retry.
    if reflect.strip().lower().startswith('no'):
        answer = generate(query, f"{actual_relevant_context}\nSummary={video_summary}")

    return answer
|
|
def retrieve_segments_by_image_embedding(index, video_path, model_stack, image_query_path):
    """Return the metadata of the five segments most similar to an image.

    Args:
        index: Vector index exposing ``query(vector=, top_k=, filter=)`` and
            ``fetch(ids=)``; the fetch result must have a ``vectors`` mapping
            from id to an object with a ``metadata`` attribute.
        video_path: Only vectors whose ``video_path`` metadata equals this
            value are searched.
        model_stack: Five-element sequence; elements 1 and 2 are used as the
            vision model and its input processor.
        image_query_path: Path of the query image.

    Returns:
        The metadata of each match, in the order the matches were returned
        by the index query. Matches whose id is absent from the fetch result
        are skipped.
    """
    _, vision_model, vision_model_processor, _, _ = model_stack

    # The context manager closes the image file once the processor has
    # turned the pixels into model inputs.
    with Image.open(image_query_path) as image_query:
        inputs = vision_model_processor(images=image_query, return_tensors="pt")
    outputs = vision_model(**inputs)
    image_query_embeds = outputs.pooler_output

    # NOTE(review): for a single image the pooled output is presumably shaped
    # (1, dim), which tolist() would turn into a nested list. flatten() makes
    # the query vector a flat list and is a no-op for 1-D input; confirm
    # against the vision model actually in use.
    query_vector = image_query_embeds.flatten().tolist()

    res = index.query(
        vector=query_vector,
        top_k=5,
        filter={"video_path": {"$eq": video_path}},
    )

    match_ids = [match_['id'] for match_ in res['matches']]
    if not match_ids:
        # Avoid issuing a fetch with an empty id list.
        return []

    # One batched fetch instead of one round trip per match.
    fetched = index.fetch(ids=match_ids).vectors

    # Iterate match_ids to keep the query's result order; ids missing from
    # the mapping are skipped instead of failing on a placeholder that has
    # no ``metadata``.
    return [fetched[match_id].metadata for match_id in match_ids if match_id in fetched]
|
|
|
|
def answer_image_question(index, model_stack, metadatas, video_summary:str, video_path:str, query:str, image_query_path:str=None):
    """Answer a question that comes with a query image.

    The segments closest to the image are retrieved with
    ``retrieve_segments_by_image_embedding`` and handed, together with the
    image and the question, to ``generate_w_image``.

    ``metadatas`` and ``video_summary`` are accepted but not used here.

    Returns:
        The answer text produced by ``generate_w_image``.
    """
    matched_segments = retrieve_segments_by_image_embedding(
        index, video_path, model_stack, image_query_path
    )
    answer = generate_w_image(query, image_query_path, matched_segments)
    return answer
|
|
|
|
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
|
|
def generate_w_image(query:str, image_query_path:str, relevant_metadatas):
    """Answer ``query`` about an image using the retrieved video segments.

    The image is sent to gpt-4o-mini as a base64 data URL together with one
    ``Segment {i}: transcript=... caption=...`` line per retrieved segment
    and the user's question.

    Args:
        query: The user's question.
        image_query_path: Path of the query image; read via ``encode_image``.
        relevant_metadatas: Sequence of segment dicts providing the keys
            ``'transcript'`` and ``'caption'``.

    Returns:
        The text content of the model's first choice (the request caps the
        reply at 100 tokens).
    """
    import mimetypes  # local import so this block needs no file-level change

    base64_image = encode_image(image_query_path)

    # Label the data URL with the image's real type when the file extension
    # reveals it; fall back to the previously hard-coded PNG otherwise.
    guessed_type, _ = mimetypes.guess_type(image_query_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/png"

    context = "".join(
        f"Segment {i}: transcript={frame['transcript']} caption={frame['caption']}\n"
        for i, frame in enumerate(relevant_metadatas)
    )

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": f"Here is some context about the image: {context}"},
                # This must be an f-string: as a plain literal the model was
                # sent the text "{query}" instead of the user's question.
                {"type": "text", "text": f"You are a helpful LLM assistant. You are good at answering questions about a video given an image. Given the context surrounding the frames most correlated with the image and image, please answer the question. Question: {query}"},
                {"type": "image_url", "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }},
            ]}
        ],
        temperature=0.0,
        max_tokens=100,
    )

    return response.choices[0].message.content