This is the backend for the web app, written in Django. It uses text-davinci-003 to answer questions over embeddings of web-scraped site data.
Demo:
Demo.mov
Main code:
Lines 1 to 110 in aac7b9e
```python
import os
import json

import numpy as np
import openai
import pandas as pd
from django.http import HttpResponse
from django.shortcuts import render
from django.views.decorators.csrf import csrf_exempt
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

openai.api_key = os.environ.get("TOKEN")


# Create your views here.
@csrf_exempt
def prompt(request):  # POST /api/prompt
    if request.method != 'POST':
        data = {'res': 'Only POST requests allowed!'}
        return HttpResponse(content=json.dumps(data),
                            content_type='application/json', status=405)

    # Load the precomputed embeddings and parse each stringified vector back into a numpy array
    df = pd.read_csv('processed/embeddings.csv', index_col=0)
    df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

    # Access the properties from the JSON request body
    data = json.loads(request.body)
    prompt = data.get('prompt')

    ans = answer_question(
        df,
        question=f"{prompt} Please show me a correct, valid link to a page that also answers my question in a separate paragraph titled Relevant Link(s). Remove any html syntax.",
        debug=False,
    )
    output = {'answer': ans}
    return HttpResponse(content=json.dumps(output),
                        content_type='application/json')


def create_context(question, df, max_len=1800, size="ada"):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(question, df, max_len=max_len, size=size)

    # If debug, print the assembled context
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"Sorry, I didn't understand that. Please rephrase.\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
```
Scraping and embedding script:
Lines 1 to 391 in 711e28b
```python
################################################################################
### Step 1
################################################################################

import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import openai
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity

openai.api_key = os.environ.get("TOKEN")

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'

# Define root domain to crawl
domain = "sitefarm.ucdavis.edu"
full_url = "https://sitefarm.ucdavis.edu/"
# Create a class to parse the HTML and get the hyperlinks
# class HyperlinkParser(HTMLParser):
#     def __init__(self):
#         super().__init__()
#         # Create a list to store the hyperlinks
#         self.hyperlinks = []
#
#     # Override the HTMLParser's handle_starttag method to get the hyperlinks
#     def handle_starttag(self, tag, attrs):
#         attrs = dict(attrs)
#
#         # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
#         if tag == "a" and "href" in attrs:
#             self.hyperlinks.append(attrs["href"])

# ################################################################################
# ### Step 2
# ################################################################################

# # Function to get the hyperlinks from a URL
# def get_hyperlinks(url):
#     # Try to open the URL and read the HTML
#     try:
#         # Open the URL and read the HTML
#         with urllib.request.urlopen(url) as response:
#             # If the response is not HTML, return an empty list
#             if not response.info().get('Content-Type').startswith("text/html"):
#                 return []
#
#             # Decode the HTML
#             html = response.read().decode('utf-8')
#     except Exception as e:
#         print(e)
#         return []
#
#     # Create the HTML Parser and then parse the HTML to get hyperlinks
#     parser = HyperlinkParser()
#     parser.feed(html)
#     return parser.hyperlinks

# ################################################################################
# ### Step 3
# ################################################################################

# # Function to get the hyperlinks from a URL that are within the same domain
# def get_domain_hyperlinks(local_domain, url):
#     clean_links = []
#     for link in set(get_hyperlinks(url)):
#         clean_link = None
#
#         # If the link is a URL, check if it is within the same domain
#         if re.search(HTTP_URL_PATTERN, link):
#             # Parse the URL and check if the domain is the same
#             url_obj = urlparse(link)
#             if url_obj.netloc == local_domain:
#                 clean_link = link
#
#         # If the link is not a URL, check if it is a relative link
#         else:
#             if link.startswith("/"):
#                 link = link[1:]
#             elif (
#                 link.startswith("#")
#                 or link.startswith("mailto:")
#                 or link.startswith("tel:")
#             ):
#                 continue
#             url_obj = urlparse(link)
#             clean_link = "https://" + local_domain + "/" + link
#
#         if clean_link is not None:
#             if clean_link.endswith("/"):
#                 clean_link = clean_link[:-1]
#             clean_links.append(clean_link)
#
#     # Return the list of hyperlinks that are within the same domain
#     return list(set(clean_links))

# ################################################################################
# ### Step 4
# ################################################################################

# def crawl(url):
#     # Parse the URL and get the domain
#     local_domain = urlparse(url).netloc
#
#     # Create a queue to store the URLs to crawl
#     queue = deque([url])
#
#     # Create a set to store the URLs that have already been seen (no duplicates)
#     seen = set([url])
#
#     # Create a directory to store the text files
#     if not os.path.exists("text/"):
#         os.mkdir("text/")
#     if not os.path.exists("text/" + local_domain + "/"):
#         os.mkdir("text/" + local_domain + "/")
#
#     # Create a directory to store the csv files
#     if not os.path.exists("processed"):
#         os.mkdir("processed")
#
#     # While the queue is not empty, continue crawling
#     while queue:
#         # Get the next URL from the queue
#         url = queue.pop()
#         print(url)  # for debugging and to see the progress
#
#         # Save text from the url to a <url>.txt file
#         with open('text/' + local_domain + '/' + url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
#             # Get the text from the URL using BeautifulSoup
#             soup = BeautifulSoup(requests.get(url).text, "html.parser")
#
#             # Get the text but remove the tags
#             text = soup.get_text()
#
#             # If the page requires JavaScript to render, flag it (the placeholder text is still written below)
#             if ("You need to enable JavaScript to run this app." in text):
#                 print("Unable to parse page " + url + " due to JavaScript being required")
#
#             # Write the text to the file in the text directory
#             f.write(text)
#
#         # Get the hyperlinks from the URL and add them to the queue
#         for link in get_domain_hyperlinks(local_domain, url):
#             if link not in seen:
#                 queue.append(link)
#                 seen.add(link)

# crawl(full_url)
################################################################################
### Step 5
################################################################################

# def remove_newlines(serie):
#     serie = serie.str.replace('\n', ' ')
#     serie = serie.str.replace('\\n', ' ')
#     # Collapse double spaces left over from the replacements above
#     serie = serie.str.replace('  ', ' ')
#     serie = serie.str.replace('  ', ' ')
#     return serie
# ################################################################################
# ### Step 6
# ################################################################################

# # Create a list to store the text files
# texts = []

# # Get all the text files in the text directory
# for file in os.listdir("text/" + domain + "/"):
#     # Open the file and read the text
#     with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
#         text = f.read()
#
#         # Omit the first 11 characters and the last 4 characters (the .txt extension), then replace -, _, and #update with spaces
#         texts.append((file[11:-4].replace('-', ' ').replace('_', ' ').replace('#update', ''), text))

# # Create a dataframe from the list of texts
# df = pd.DataFrame(texts, columns=['fname', 'text'])

# # Set the text column to be the raw text with the newlines removed
# df['text'] = df.fname + ". " + remove_newlines(df.text)
# df.to_csv('processed/scraped.csv')
# df.head()
# ################################################################################
# ### Step 7
# ################################################################################

# # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
# tokenizer = tiktoken.get_encoding("cl100k_base")

# df = pd.read_csv('processed/scraped.csv', index_col=0)
# df.columns = ['title', 'text']

# # Tokenize the text and save the number of tokens to a new column
# df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))

# # Visualize the distribution of the number of tokens per row using a histogram
# df.n_tokens.hist()
# ################################################################################
# ### Step 8
# ################################################################################

# max_tokens = 500

# # Function to split the text into chunks of a maximum number of tokens
# def split_into_many(text, max_tokens=max_tokens):
#     # Split the text into sentences
#     sentences = text.split('. ')
#
#     # Get the number of tokens for each sentence
#     n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
#
#     chunks = []
#     tokens_so_far = 0
#     chunk = []
#
#     # Loop through the sentences and tokens joined together in a tuple
#     for sentence, token in zip(sentences, n_tokens):
#         # If the number of tokens so far plus the number of tokens in the current sentence is greater
#         # than the max number of tokens, then add the chunk to the list of chunks and reset
#         # the chunk and tokens so far
#         if tokens_so_far + token > max_tokens:
#             chunks.append(". ".join(chunk) + ".")
#             chunk = []
#             tokens_so_far = 0
#
#         # If the number of tokens in the current sentence is greater than the max number of
#         # tokens, go to the next sentence
#         if token > max_tokens:
#             continue
#
#         # Otherwise, add the sentence to the chunk and add the number of tokens to the total
#         chunk.append(sentence)
#         tokens_so_far += token + 1
#
#     # Add the last chunk to the list of chunks
#     if chunk:
#         chunks.append(". ".join(chunk) + ".")
#
#     return chunks

# shortened = []

# # Loop through the dataframe
# for row in df.iterrows():
#     # If the text is None, go to the next row
#     if row[1]['text'] is None:
#         continue
#
#     # If the number of tokens is greater than the max number of tokens, split the text into chunks
#     if row[1]['n_tokens'] > max_tokens:
#         shortened += split_into_many(row[1]['text'])
#
#     # Otherwise, add the text to the list of shortened texts
#     else:
#         shortened.append(row[1]['text'])
################################################################################
### Step 9
################################################################################

# df = pd.DataFrame(shortened, columns=['text'])
# df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# df.n_tokens.hist()

################################################################################
### Step 10
################################################################################

# Note that you may run into rate limit issues depending on how many files you try to embed
# Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits

# df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
# df.to_csv('processed/embeddings.csv')
# df.head()

################################################################################
### Step 11
################################################################################

# Load the saved embeddings and parse each stringified vector back into a numpy array
df = pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df.head()
################################################################################
### Step 12
################################################################################

def create_context(question, df, max_len=1800, size="ada"):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None,
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(question, df, max_len=max_len, size=size)

    # If debug, print the assembled context
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"Sorry, I didn't understand that. Please rephrase.\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
################################################################################
### Step 13
################################################################################

print(answer_question(df, question="How do I publish? Please show me a relevant training link pertaining to my question in another, separate paragraph titled Relevant Link(s). If my question only asks for a link or page, provide only a link. Remove any html syntax if present.", debug=False))
```
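Both files read processed/embeddings.csv back into pandas, so the only contract between the scraping script and the Django view is that CSV's shape. A minimal illustrative sketch of what the retrieval code expects (the sample text and token count are made up; only the column names and the stringified-list storage come from the code above):

```python
import numpy as np
import pandas as pd

# Illustrative only: the three columns the retrieval code relies on.
# 'embeddings' round-trips through CSV as a stringified list, which is
# why the loaders call .apply(eval).apply(np.array) on it.
df = pd.DataFrame({
    "text": ["sitefarm ucdavis edu training. How to publish a page ..."],  # hypothetical row
    "n_tokens": [12],                      # token count from tiktoken's cl100k_base
    "embeddings": [list(np.zeros(1536))],  # text-embedding-ada-002 vectors have 1536 dims
})
df.to_csv("processed/embeddings.csv")
```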