berkeley/calhacks summer 2023 backend django app

This is the backend for the web app, written in Django. It uses OpenAI's text-davinci-003 model to provide text completions grounded in web-scraped data.
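The app exposes a single endpoint, POST /api/prompt (handled in api/views.py below), which reads a JSON body containing a prompt field and responds with a JSON object of the form {"answer": "..."}. A minimal sketch of calling it, assuming the Django dev server is running locally on the default port:

import requests

resp = requests.post(
    "http://localhost:8000/api/prompt",           # POST /api/prompt (see api/views.py)
    json={"prompt": "How do I publish a page?"},  # the view reads the "prompt" field
    timeout=30,
)
print(resp.json()["answer"])                      # the view returns {"answer": "..."}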

Demo:

Demo.mov

Main code:

berkeley/api/views.py

Lines 1 to 110 in aac7b9e

import os
import json
from django.shortcuts import render
import openai
import numpy as np
import pandas as pd
from django.http import HttpResponse
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
from django.views.decorators.csrf import csrf_exempt

openai.api_key = os.environ.get("TOKEN")


# Create your views here.
@csrf_exempt
def prompt(request):  # POST /api/prompt
    if request.method != 'POST':
        data = {'res': 'Only POST requests allowed!'}
        res = HttpResponse(content=json.dumps(
            data), content_type='application/json')
        return res

    # Load the precomputed embeddings produced by scraper.py
    df = pd.read_csv('processed/embeddings.csv', index_col=0)
    df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
    df.head()

    data = json.loads(request.body)
    # Access the properties from the JSON data
    prompt = data.get('prompt')
    ans = answer_question(df, question=f"{prompt} Please show me a correct, valid link to a page that also answers my question in a separate paragraph titled Relevant Link(s). Remove any html syntax.", debug=False)
    output = {'answer': ans}
    res = HttpResponse(content=json.dumps(
        output), content_type='application/json')
    return res


def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"Sorry, I didn't understand that. Please rephrase.\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
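The retrieval step above is a nearest-neighbour lookup over the precomputed embeddings: the question is embedded with text-embedding-ada-002, cosine distances to every stored chunk are computed, and the closest chunks are concatenated until the max_len token budget is hit. Below is a small illustrative sketch of that ranking, using toy 3-d vectors and a hand-rolled cosine distance in place of the real API call and the openai.embeddings_utils helper:

import numpy as np
import pandas as pd

def cosine_distance(a, b):
    # 1 - cosine similarity; smaller means more similar
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

# Toy stand-ins for the 'text', 'n_tokens', and 'embeddings' columns of processed/embeddings.csv
df = pd.DataFrame({
    "text": ["How to publish a page", "Editing your profile", "Campus parking info"],
    "n_tokens": [6, 5, 4],
    "embeddings": [np.array([0.9, 0.1, 0.0]),
                   np.array([0.2, 0.8, 0.1]),
                   np.array([0.0, 0.1, 0.9])],
})

q_embedding = np.array([0.85, 0.15, 0.05])  # pretend this came from the embeddings API
df["distances"] = [cosine_distance(q_embedding, e) for e in df["embeddings"]]

# Closest rows come first, which is exactly how create_context fills its context window
print(df.sort_values("distances")[["text", "distances"]])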

berkeley/scraper.py

Lines 1 to 391 in 711e28b

################################################################################
### Step 1
################################################################################
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import pandas as pd
import tiktoken
import openai
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, cosine_similarity
openai.api_key = os.environ.get("TOKEN")
# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]{0,1}://.+$'
# Define root domain to crawl
domain = "sitefarm.ucdavis.edu"
full_url = "https://sitefarm.ucdavis.edu/"
# Create a class to parse the HTML and get the hyperlinks
# class HyperlinkParser(HTMLParser):
#     def __init__(self):
#         super().__init__()
#         # Create a list to store the hyperlinks
#         self.hyperlinks = []
#     # Override the HTMLParser's handle_starttag method to get the hyperlinks
#     def handle_starttag(self, tag, attrs):
#         attrs = dict(attrs)
#         # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
#         if tag == "a" and "href" in attrs:
#             self.hyperlinks.append(attrs["href"])
# ################################################################################
# ### Step 2
# ################################################################################
# # Function to get the hyperlinks from a URL
# def get_hyperlinks(url):
#     # Try to open the URL and read the HTML
#     try:
#         # Open the URL and read the HTML
#         with urllib.request.urlopen(url) as response:
#             # If the response is not HTML, return an empty list
#             if not response.info().get('Content-Type').startswith("text/html"):
#                 return []
#             # Decode the HTML
#             html = response.read().decode('utf-8')
#     except Exception as e:
#         print(e)
#         return []
#     # Create the HTML Parser and then Parse the HTML to get hyperlinks
#     parser = HyperlinkParser()
#     parser.feed(html)
#     return parser.hyperlinks
# ################################################################################
# ### Step 3
# ################################################################################
# # Function to get the hyperlinks from a URL that are within the same domain
# def get_domain_hyperlinks(local_domain, url):
#     clean_links = []
#     for link in set(get_hyperlinks(url)):
#         clean_link = None
#         # If the link is a URL, check if it is within the same domain
#         if re.search(HTTP_URL_PATTERN, link):
#             # Parse the URL and check if the domain is the same
#             url_obj = urlparse(link)
#             if url_obj.netloc == local_domain:
#                 clean_link = link
#         # If the link is not a URL, check if it is a relative link
#         else:
#             if link.startswith("/"):
#                 link = link[1:]
#             elif (
#                 link.startswith("#")
#                 or link.startswith("mailto:")
#                 or link.startswith("tel:")
#             ):
#                 continue
#             url_obj = urlparse(link)
#             clean_link = "https://" + local_domain + "/" + link
#         if clean_link is not None:
#             if clean_link.endswith("/"):
#                 clean_link = clean_link[:-1]
#             clean_links.append(clean_link)
#     # Return the list of hyperlinks that are within the same domain
#     return list(set(clean_links))
# ################################################################################
# ### Step 4
# ################################################################################
# def crawl(url):
#     # Parse the URL and get the domain
#     local_domain = urlparse(url).netloc
#     # Create a queue to store the URLs to crawl
#     queue = deque([url])
#     # Create a set to store the URLs that have already been seen (no duplicates)
#     seen = set([url])
#     # Create a directory to store the text files
#     if not os.path.exists("text/"):
#         os.mkdir("text/")
#     if not os.path.exists("text/"+local_domain+"/"):
#         os.mkdir("text/" + local_domain + "/")
#     # Create a directory to store the csv files
#     if not os.path.exists("processed"):
#         os.mkdir("processed")
#     # While the queue is not empty, continue crawling
#     while queue:
#         # Get the next URL from the queue
#         url = queue.pop()
#         print(url)  # for debugging and to see the progress
#         # Save text from the url to a <url>.txt file
#         with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
#             # Get the text from the URL using BeautifulSoup
#             soup = BeautifulSoup(requests.get(url).text, "html.parser")
#             # Get the text but remove the tags
#             text = soup.get_text()
#             # If the crawler gets to a page that requires JavaScript, it will stop the crawl
#             if ("You need to enable JavaScript to run this app." in text):
#                 print("Unable to parse page " + url + " due to JavaScript being required")
#             # Otherwise, write the text to the file in the text directory
#             f.write(text)
#         # Get the hyperlinks from the URL and add them to the queue
#         for link in get_domain_hyperlinks(local_domain, url):
#             if link not in seen:
#                 queue.append(link)
#                 seen.add(link)
# crawl(full_url)
################################################################################
### Step 5
################################################################################
# def remove_newlines(serie):
#     serie = serie.str.replace('\n', ' ')
#     serie = serie.str.replace('\\n', ' ')
#     serie = serie.str.replace('  ', ' ')
#     serie = serie.str.replace('  ', ' ')
#     return serie
# ################################################################################
# ### Step 6
# ################################################################################
# # Create a list to store the text files
# texts=[]
# # Get all the text files in the text directory
# for file in os.listdir("text/" + domain + "/"):
#     # Open the file and read the text
#     with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
#         text = f.read()
#         # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.
#         texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))
# # Create a dataframe from the list of texts
# df = pd.DataFrame(texts, columns = ['fname', 'text'])
# # Set the text column to be the raw text with the newlines removed
# df['text'] = df.fname + ". " + remove_newlines(df.text)
# df.to_csv('processed/scraped.csv')
# df.head()
# ################################################################################
# ### Step 7
# ################################################################################
# # Load the cl100k_base tokenizer which is designed to work with the ada-002 model
# tokenizer = tiktoken.get_encoding("cl100k_base")
# df = pd.read_csv('processed/scraped.csv', index_col=0)
# df.columns = ['title', 'text']
# # Tokenize the text and save the number of tokens to a new column
# df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# # Visualize the distribution of the number of tokens per row using a histogram
# df.n_tokens.hist()
# ################################################################################
# ### Step 8
# ################################################################################
# max_tokens = 500
# # Function to split the text into chunks of a maximum number of tokens
# def split_into_many(text, max_tokens = max_tokens):
#     # Split the text into sentences
#     sentences = text.split('. ')
#     # Get the number of tokens for each sentence
#     n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
#     chunks = []
#     tokens_so_far = 0
#     chunk = []
#     # Loop through the sentences and tokens joined together in a tuple
#     for sentence, token in zip(sentences, n_tokens):
#         # If the number of tokens so far plus the number of tokens in the current sentence is greater
#         # than the max number of tokens, then add the chunk to the list of chunks and reset
#         # the chunk and tokens so far
#         if tokens_so_far + token > max_tokens:
#             chunks.append(". ".join(chunk) + ".")
#             chunk = []
#             tokens_so_far = 0
#         # If the number of tokens in the current sentence is greater than the max number of
#         # tokens, go to the next sentence
#         if token > max_tokens:
#             continue
#         # Otherwise, add the sentence to the chunk and add the number of tokens to the total
#         chunk.append(sentence)
#         tokens_so_far += token + 1
#     # Add the last chunk to the list of chunks
#     if chunk:
#         chunks.append(". ".join(chunk) + ".")
#     return chunks
# shortened = []
# # Loop through the dataframe
# for row in df.iterrows():
#     # If the text is None, go to the next row
#     if row[1]['text'] is None:
#         continue
#     # If the number of tokens is greater than the max number of tokens, split the text into chunks
#     if row[1]['n_tokens'] > max_tokens:
#         shortened += split_into_many(row[1]['text'])
#     # Otherwise, add the text to the list of shortened texts
#     else:
#         shortened.append( row[1]['text'] )
################################################################################
### Step 9
################################################################################
# df = pd.DataFrame(shortened, columns = ['text'])
# df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# df.n_tokens.hist()
################################################################################
### Step 10
################################################################################
# Note that you may run into rate limit issues depending on how many files you try to embed
# Please check out our rate limit guide to learn more on how to handle this: https://platform.openai.com/docs/guides/rate-limits
# df['embeddings'] = df.text.apply(lambda x: openai.Embedding.create(input=x, engine='text-embedding-ada-002')['data'][0]['embedding'])
# df.to_csv('processed/embeddings.csv')
# df.head()
# ################################################################################
# ### Step 11
# ################################################################################
df=pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df.head()
# ################################################################################
# ### Step 12
# ################################################################################
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)


def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completion using the question and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"Sorry, I didn't understand that. Please rephrase.\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""
################################################################################
### Step 13
################################################################################
print(answer_question(df, question="How do I publish? Please show me a relevant training link pertaining to my question in another, separate paragraph titled Relevant Link(s). If my question only asks for a link or page, provide only a link. Remove any html syntax if present.", debug=False))
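Note that Steps 1-10 (the crawl, cleanup, chunking, and embedding passes) are commented out; they have to be run once to produce processed/embeddings.csv before either the Step 13 smoke test above or the Django view can answer anything. A rough sketch of the local setup, assuming a standard Django layout with manage.py at the repo root (the key value is a placeholder):

import os
import subprocess

# Both scraper.py and api/views.py read the OpenAI key from the TOKEN env var
os.environ["TOKEN"] = "sk-..."  # placeholder, not a real key

# 1. Uncomment Steps 1-10 in scraper.py, then run it to crawl the site,
#    chunk the pages, and write processed/embeddings.csv
subprocess.run(["python", "scraper.py"], check=True)

# 2. Start the API; POST /api/prompt will now answer against those embeddings
subprocess.run(["python", "manage.py", "runserver"], check=True)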
