Introduction
Extracting contact details from online sources can be time-consuming. This Python script automates the process, using the Google Custom Search API to find pages and OpenAI’s GPT-3.5 to extract names and emails from the results.
Performing Google Search
The script sends search queries via Google’s Custom Search API. It retrieves up to 100 results in batches of 10, collecting snippets containing potential contact information.
Extracting Contact Information
The extracted snippets are processed using OpenAI’s GPT-3.5. The AI is prompted to extract names and emails, simplifying manual data collection efforts.
Regex-Based Data Parsing
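A regex-based helper function parses each line of the model's response, pulling out the email address and the name (expected in a "Name: X" format). This turns the free-form response into structured contact information before the results are saved.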
Saving Data to Excel
Using pandas, the script compiles the extracted details into an Excel sheet, allowing easy access and further processing of the retrieved information.
Conclusion
This script automates lead generation by integrating Google Search and AI-driven contact extraction. It streamlines data collection, saving time and effort for businesses and researchers.
Here is the complete Python script:
import logging
import os
import re

import openai
import pandas as pd
import requests
# Set your API keys
# Each credential is read from the environment variable of the same name when
# it is set, so real keys do not have to be written into (or committed with)
# this script. The original placeholder strings remain as the fallback.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', 'YOUR GOOGLE API KEY')
GOOGLE_CSE_ID = os.environ.get('GOOGLE_CSE_ID', 'GOOGLE CSE ID')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YOUR OPENAI API KEY')
openai.api_key = OPENAI_API_KEY
# Function to perform Google search
def google_search(query, api_key, cse_id, num_results=10, start=1, timeout=10):
    """Fetch one page of results from the Google Custom Search JSON API.

    Args:
        query: Search query string, sent as the ``q`` parameter.
        api_key: Google API key, sent as ``key``.
        cse_id: Custom Search Engine ID, sent as ``cx``.
        num_results: Number of results requested for this page (``num``).
        start: 1-based index of the first result to return (``start``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response. The body is returned as-is,
        whatever the HTTP status, so callers should treat a missing
        ``items`` key as "no results".

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'key': api_key,
        'cx': cse_id,
        'q': query,
        'num': num_results,
        'start': start,
    }
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()
# Helper function to extract name and email using regex
# Patterns are compiled once because the helper is called for every line of
# the model's response.
_EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# A name is one or more words separated by spaces/tabs only (never a newline,
# so it cannot run into the next line's label). A word may contain internal
# apostrophes, periods or hyphens, e.g. "Mary-Jane O'Neil".
_NAME_PATTERN = re.compile(
    r"Name:[ \t]*(\w+(?:['.-]\w+)*(?:[ \t]+\w+(?:['.-]\w+)*)*)"
)


def extract_name_email(text):
    """Pull the first name and the first email address out of ``text``.

    The name is only recognised when it follows a literal ``Name:`` label;
    the email is the first substring that looks like ``user@domain.tld``.

    Args:
        text: The string to scan. ``None`` or an empty string is accepted.

    Returns:
        A ``(name, email)`` tuple. Either element is ``None`` when the
        corresponding value was not found.
    """
    if not text:
        return None, None

    email_match = _EMAIL_PATTERN.search(text)
    name_match = _NAME_PATTERN.search(text)

    email = email_match.group(0) if email_match else None
    name = name_match.group(1).strip() if name_match else None
    return name, email
# Main search process with pagination for more results
search_query = 'site:facebook.com "Content Creator" "@gmail.com" OR "@hotmail.com" OR "@yahoo.com" OR "@outlook.com" OR "@aol.com" OR "@msn.com" "IN"'
total_results = 100  # Set this to the total number of results you want
results_per_page = 10  # Google API allows a max of 10 per page
start = 1
data = []
snippets = []  # Collect all snippets for batch processing later

# Loop through the pages of results
while start <= total_results:
    # Shrink the last page so no more than total_results items are fetched
    # when total_results is not a multiple of results_per_page.
    page_size = min(results_per_page, total_results - start + 1)
    results = google_search(
        search_query,
        GOOGLE_API_KEY,
        GOOGLE_CSE_ID,
        num_results=page_size,
        start=start,
    )

    items = results.get('items') or []
    if not items:
        # No items means the results are exhausted or the API returned an
        # error body; further pages would only waste API quota.
        break

    for item in items:
        title = item.get('title')
        snippet = item.get('snippet')

        # Collect snippets for batch processing
        snippets.append(snippet)

        # Store the raw data for later
        data.append({
            'Title': title,
            'Snippet': snippet,
            'Name': None,  # Placeholder, will fill later
            'Email': None,  # Placeholder, will fill later
        })

    start += results_per_page  # Move to the next page of results
# Combine all snippets into one large prompt to send to OpenAI
# The instructions request one "Name: ..., Email: ..." line per snippet, in
# order: the response is later parsed line by line with a regex that looks for
# a "Name:" label, and lines are matched to search results by position.
prompt_header = (
    "Extract name, email address from the following numbered snippets.\n"
    "Reply with exactly one line per snippet, in the same order, formatted as:\n"
    "Name: <name>, Email: <email>\n"
    "Leave a value empty if the snippet does not contain it.\n\n"
)
# A result without a snippet is stored as None; substitute an empty string so
# str.join does not raise TypeError and the numbering still matches `data`.
numbered_snippets = [
    f"{number}. {snippet or ''}"
    for number, snippet in enumerate(snippets, start=1)
]
combined_snippet_text = prompt_header + "\n\n".join(numbered_snippets)
# Call OpenAI API once with the entire combined text
def call_gpt3_5_turbo(prompt):
    """Send ``prompt`` to the ``gpt-3.5-turbo`` chat model and return its reply.

    Args:
        prompt: Text sent as a single ``user`` message.

    Returns:
        The content of the first choice in the response, or an empty string
        if the request (or reading the response) fails. The failure is logged
        rather than returned, so an error message can never be mistaken for
        model output and parsed as contact data.
    """
    try:
        # Make a single call to OpenAI API
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
        )
        return response['choices'][0]['message']['content']
    except Exception:
        # Broad catch is deliberate: this is a best-effort boundary and the
        # script should still save the raw search results if the call fails.
        logging.getLogger(__name__).exception("OpenAI request failed")
        return ""
# Send the combined snippets to the OpenAI API
openai_response = call_gpt3_5_turbo(combined_snippet_text)

# Now process the OpenAI response to extract the name and email
# Assuming the OpenAI response lists the name-email pairs in order.
# Blank lines are dropped first: they carry no data, and counting them would
# shift every later name/email onto the wrong search result.
extracted_infos = [line for line in openai_response.split("\n") if line.strip()]

# zip stops at the shorter sequence, so surplus response lines are ignored and
# records without a matching line keep their None placeholders.
for record, extracted_info in zip(data, extracted_infos):
    name, email = extract_name_email(extracted_info)
    # Update the respective record in the data list
    record['Name'] = name
    record['Email'] = email

# Save to an Excel file using pandas
df = pd.DataFrame(data)
df.to_excel('google_search_results.xlsx', index=False)
print(f"Data for {len(data)} contacts has been saved to google_search_results.xlsx")