upload (70f4914d) · Commits · hillengass / SynDRA

preprocessing/extract_dialogues.py

0 → 100644

+39 −0

Original line number	Diff line number	Diff line
		import json

		def extract_info_from_dialogue(dialogue):
		"""Extracts service information and utterances from a single dialogue."""
		knowledge_base = dialogue['services']
		utterances_list = []

		# Extracting service information where active_intent is not NONE
		active_services = [frame['service'] for turn in dialogue['turns'] for frame in turn.get('frames', []) if frame.get('state', {}).get('active_intent') != "NONE"]

		for turn in dialogue['turns']:
		speaker_utterance = f"{turn['speaker']}: {turn['utterance']}"
		utterances_list.append(speaker_utterance)

		return {
		"knowledge_base": knowledge_base,
		"active_services": active_services,
		"utterances_list": utterances_list
		}

		# Input the filename from the user
		filename = "dialogues.json"

		# Load the data from the file
		with open(filename, 'r') as file:
		data = json.load(file)

		# Process and save each dialogue separately
		for i, dialogue in enumerate(data):
		extracted_info = extract_info_from_dialogue(dialogue)
		output_filename = f"extracted_dialogue_{i + 1}.txt"

		# Format the extracted information and save it to a text file
		with open(output_filename, 'w') as outfile:
		outfile.write("Knowledge Base: " + ', '.join(extracted_info['knowledge_base']) + '\n')
		outfile.write("Active Services: " + ', '.join(extracted_info['active_services']) + '\n\n')
		outfile.write('\n'.join(extracted_info['utterances_list']))

		print(f"Saved extracted data for dialogue {i + 1} to {output_filename}")

preprocessing/extract_domain_knowledge.py

0 → 100644

+61 −0

Original line number	Diff line number	Diff line
		import json


		def extract_active_frames_from_dialogue(dialogue):
		"""Extracts frames with active intents from a single dialogue."""
		# Extract frames where active_intent is not NONE
		active_frames = [frame for turn in dialogue['turns'] for frame in turn.get('frames', []) if
		frame.get('state', {}).get('active_intent') != "NONE"]
		return active_frames


		def format_frame(frame):
		# Extract primary information
		service = frame.get('service', 'Unknown Service')
		active_intent = frame.get('state', {}).get('active_intent', 'Unknown Intent')

		# Start with the service and intent
		formatted_info = f"Service: {service}\nIntent: {active_intent}\n"

		# Extract slots and their values
		for slot in frame.get('slots', []):
		slot_name = slot.get('slot', 'Unknown Slot')
		value = slot.get('value', 'Unknown Value')
		start = slot.get('start', '??')
		end = slot.get('exclusive_end', '??')
		formatted_info += f" Slot: {slot_name} -> {value} (Position: {start}-{end})\n"

		# Extract slot values from state
		slot_values = frame.get('state', {}).get('slot_values', {})
		if slot_values:
		formatted_info += " Slot Values:\n"
		for key, values in slot_values.items():
		formatted_info += f" {key} -> {', '.join(values)}\n"

		return formatted_info


		# Input the filename from the user
		filename = "dialogues.json"

		# Load the data from the file
		with open(filename, 'r') as file:
		data = json.load(file)

		# Process and save each dialogue's frames separately
		for i, dialogue in enumerate(data):
		extracted_frames = extract_active_frames_from_dialogue(dialogue)

		if extracted_frames: # Only save if there are active frames
		formatted_text = ""
		for frame in extracted_frames:
		formatted_text += format_frame(frame)
		formatted_text += '-' * 50 + '\n' # Separate frames

		output_filename = f"formatted_frames_dialogue_{i + 1}.txt"

		# Save the formatted text to a file
		with open(output_filename, 'w') as outfile:
		outfile.write(formatted_text)

		print(f"Saved formatted frames for dialogue {i + 1} to {output_filename}")