Summary of Steps

  1. Open the CSV file containing all the offenses
  2. Loop through each row and request an embedding for that row's offense
  3. Collect each offense and its embedding into a dataframe
  4. Write that dataframe to a new .csv file
WARNING: This script can take a long time to run because it requests embeddings from the OpenAI API.
In [1]:
import os
import math
import requests
import json
import urllib3
import pandas as pd

import openai

# Build a table pairing every offense description with its OpenAI embedding.

raw_file_in = 'police_output.csv'

# Only the OFFENSES column is used; no other column is loaded.
rawData = pd.read_csv(raw_file_in, usecols=['OFFENSES'])

off_results = []
emb_results = []

# Read the API key from the environment instead of committing a secret to the
# notebook. A KeyError here means OPENAI_API_KEY has not been exported.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Offense descriptions presumably repeat across rows; remember each embedding
# so the slow API is called only once per distinct input.
embedding_cache = {}

for _, row in rawData.iterrows():
    # The model input is the row serialised as JSON, i.e. {"OFFENSES":"..."}.
    # NOTE(review): this embeds the JSON wrapper, not the bare offense text.
    # Kept as-is so embeddings stay comparable with earlier runs -- confirm
    # whether that is intended.
    row_json = row.to_json()
    if row_json not in embedding_cache:
        response = openai.Embedding.create(
            input=row_json, model="text-embedding-ada-002"
        )
        embedding_cache[row_json] = response["data"][0]["embedding"]

    # Use the cell value directly. Stripping characters from the array's
    # string form would also remove brackets/apostrophes that belong to the
    # offense text itself.
    off_results.append(str(row['OFFENSES']))
    emb_results.append(embedding_cache[row_json])

final_df = pd.DataFrame({'offense': off_results, 'embedding': emb_results})
In [2]:
# Persist the offense/embedding table; index=False leaves the dataframe's
# row index out of the file.
output_path = 'police_with_embeddings.csv'
final_df.to_csv(output_path, index=False)