-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOpenAI CV.py
115 lines (92 loc) · 3.98 KB
/
OpenAI CV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import openai
import PyPDF2
import docx
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import re
# Configure logging
logging.basicConfig(filename='cv_analysis.log', level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
# Set your OpenAI API key securely (avoid storing directly in code)
openai.api_key = ("This is the magic by wizard Roman")
def is_valid_file(file_path):
"""Checks if the file exists and has content."""
if not os.path.isfile(file_path):
logging.error(f"File not found: {file_path}")
return False
if os.path.getsize(file_path) == 0:
logging.warning(f"Empty file: {file_path}")
return False
return True
def extract_text_from_pdf(pdf_path):
text = ''
with open(pdf_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
for page_num in range(len(reader.pages)):
page = reader.pages[page_num]
text += page.extract_text()
return text
def extract_text_from_docx(docx_path):
doc = docx.Document(docx_path)
text = '\n'.join([paragraph.text for paragraph in doc.paragraphs])
return text
def extract_text_from_file(file_path):
if not is_valid_file(file_path):
return None
if file_path.lower().endswith('.pdf'):
return extract_text_from_pdf(file_path)
elif file_path.lower().endswith('.docx'):
return extract_text_from_docx(file_path)
else:
raise ValueError('Unsupported file format')
def analyze_cv(cv_text):
prompt =[
{"role": "user",
"content": f"Select the following information from the CV. The answer must be exactly in this format and order: Name:\nMail:\nPhone:\nEducation:\nSkills:.\n\n{cv_text}"},]
response = openai.chat.completions.create(
model="gpt-4-1106-preview",
messages=prompt
)
return response.choices[0].message.content.strip()
def process_cv(file_path):
try:
cv_text = extract_text_from_file(file_path)
if cv_text is None:
return {'File Name': os.path.basename(file_path), 'Extracted Info': 'Error'}
extracted_info = analyze_cv(cv_text)
return {'File Name': os.path.basename(file_path), 'Extracted Info': extracted_info}
except Exception as e:
logging.error(f"Failed to process {file_path}: {e}")
return {'File Name': os.path.basename(file_path), 'Extracted Info': 'Error'}
def process_cvs_in_folder(folder_path):
data = []
with ThreadPoolExecutor(max_workers=10) as executor:
futures = []
for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
if file_path.lower().endswith(('.pdf', '.docx')):
futures.append(executor.submit(process_cv, file_path))
for future in as_completed(futures):
data.append(future.result())
df = pd.DataFrame(data)
return df
# Define the folder containing CVs (ensure the folder exists)
folder_path = 'C:\\Users\\uzivatel\\Desktop\\CVs'
if not os.path.exists(folder_path):
logging.error(f"Folder not found: {folder_path}")
exit(1)
# Process CVs and get the result as a DataFrame
cv_data_df = process_cvs_in_folder(folder_path)
def extract_info(text, field):
pattern = rf"{field}: (.*?)(?=\n\w+:|$)"
match = re.search(pattern, text, re.DOTALL)
return match.group(1).strip() if match else ""
cv_data_df['Name'] = cv_data_df['Extracted Info'].apply(lambda x: extract_info(x, 'Name'))
cv_data_df['Mail'] = cv_data_df['Extracted Info'].apply(lambda x: extract_info(x, 'Mail'))
cv_data_df['Phone'] = cv_data_df['Extracted Info'].apply(lambda x: extract_info(x, 'Phone'))
cv_data_df['Education'] = cv_data_df['Extracted Info'].apply(lambda x: extract_info(x, 'Education'))
cv_data_df['Skills'] = cv_data_df['Extracted Info'].apply(lambda x: extract_info(x, 'Skills'))
cv_data_df.drop(columns=['Extracted Info'], inplace=True)
cv_data_df.to_excel(f'{folder_path}\\cv_extracted_data.xlsx', index=False)