Fix schema (09570e4c) · Commits · hillengass / SynDRA

notebooks/knowledge_base.ipynb

+14 −14

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# HeidelKBerg

		A knowledge base for retrieval-assisted generation of task-oriented dialogues.

		%% Cell type:markdown id: tags:

		## Data

		We fetch the data from OpenStreetMap through the [Overpass API](https://overpass-turbo.eu) with this query:

		```sql
		[out:json][timeout:25];
		{{geocodeArea:Heidelberg}}->.searchArea;
		(
		nwr["amenity"="restaurant"](area.searchArea);
		nwr["amenity"="museum"](area.searchArea);
		nwr["tourism"="hotel"](area.searchArea);
		nwr["tourism"="attraction"](area.searchArea);
		);
		out geom;
		```

		and export it to `@/data/osm/osm_export.geojson`.

		%% Cell type:markdown id: tags:

		## Schema

		We define the schema of our knowledge base with pydantic types.

		%% Cell type:code id: tags:

		``` python
		from typing import List, Literal
		from pydantic import BaseModel, computed_field


		class Address(BaseModel):
		    # Street address of a venue; `suburb` is restricted to the districts
		    # listed below and `area` is derived from it.
		    street: str
		    housenumber: str
		    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
		    suburb: Literal[
		        "Altstadt",
		        "Bahnstadt",
		        "Bergheim",
		        "Boxberg",
		        "Emmertsgrund",
		        "Handschuhsheim",
		        "Kirchheim",
		        "Neuenheim",
		        "Pfaffengrund",
		        "Rohrbach",
		        "Schlierbach",
		        "Südstadt",
		        "Weststadt",
		        "Wieblingen",
		        "Ziegelhausen"
		    ]

		    # The area is technically not an address property, but every type that
		    # has an address also has an area, so it is computed here once instead
		    # of being duplicated in each venue type.
		    @computed_field
		    @property
		    def area(self) -> Literal["center", "east", "north", "south", "west"]:
		        # Each suburb appears under exactly one area.
		        suburbs_by_area = {
		            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
		            "east": ("Schlierbach", "Ziegelhausen"),
		            "north": ("Handschuhsheim", "Neuenheim"),
		            "south": ("Boxberg", "Emmertsgrund", "Kirchheim", "Rohrbach", "Südstadt"),
		            "west": ("Pfaffengrund", "Wieblingen"),
		        }
		        for area_name, suburbs in suburbs_by_area.items():
		            if self.suburb in suburbs:
		                return area_name
		        # Same failure mode as a plain dict lookup on an unmapped suburb.
		        raise KeyError(self.suburb)


		class Venue(BaseModel):
		    # Fields shared by every knowledge base entry (hotels, restaurants,
		    # attractions); the concrete venue types subclass this model.
		    name: str
		    description: str
		    address: Address


		# Tri-state answer used by the hotel amenity fields.
		_YesNoUnknown = Literal["yes", "no", "unknown"]


		class Hotel(Venue):
		    # Contact details
		    phone: str
		    website: str
		    # Coarse price bucket and star rating (stars are stored as strings)
		    price: Literal["cheap", "moderate", "expensive"]
		    stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
		    # Amenities
		    air_conditioning: _YesNoUnknown
		    internet_access: _YesNoUnknown
		    parking: _YesNoUnknown
		    smoking: _YesNoUnknown
		    wheelchair: _YesNoUnknown


		class Cuisine(BaseModel):
		    # One boolean flag per supported cuisine. Field names must be plain
		    # identifiers without trailing commas: both `african: bool,` and
		    # `"african": bool` are syntax errors in a class body.
		    african: bool
		    american: bool
		    asian: bool
		    chinese: bool
		    german: bool
		    greek: bool
		    indian: bool
		    international: bool
		    italian: bool
		    japanese: bool
		    korean: bool
		    oriental: bool
		    sushi: bool
		    thai: bool


		class Restaurant(Venue):
		    # Contact details
		    phone: str
		    website: str
		    price: Literal["cheap", "moderate", "expensive"]
		    internet_access: Literal["yes", "no", "unknown"]
		    smoking: Literal["yes", "no", "unknown"]
		    # "limited" and "unknown" are separate values; without the comma the
		    # two strings concatenate into the single literal "limitedunknown".
		    wheelchair: Literal["yes", "no", "limited", "unknown"]
		    bar: Literal["yes", "no", "unknown"]
		    delivery: Literal["yes", "no", "unknown"]
		    vegan: Literal["yes", "no", "limited", "unknown"]
		    vegetarian: Literal["yes", "no", "limited", "unknown"]
		    indoor_seating: Literal["yes", "no", "limited", "unknown"]
		    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
		    # One boolean flag per supported cuisine
		    cuisine: Cuisine


		# Categories an attraction can be tagged with.
		_AttractionType = Literal["zoo", "architecture", "view", "historic"]


		class Attraction(Venue):
		    # An attraction may carry several categories at once.
		    attraction_type: List[_AttractionType]


		#class Museum(Attraction):
		# theme: str


		#class Theatre(Attraction):
		# program: List[Literal[
		# "Monday",
		# "Tuesday",
		# "Wednesday",
		# "Thursday",
		# "Friday",
		# "Saturday",
		# "Sunday",
		# ]]


		#class TouristAttraction(Attraction):
		# view: Literal["yes", "no", "unknown"]
		```

		%% Output

		Cell In[3], line 72
		african: bool,
		^
		SyntaxError: invalid syntax

		%% Cell type:markdown id: tags:

		We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.
		Each type is written to three output files so that the data can be divided more easily among three manual annotators.

		%% Cell type:code id: tags:

		``` python
		import json
		from collections import defaultdict


		restaurants = []
		hotels = []
		attractions = []

		# Load data and split into types.
		# The encoding is given explicitly so the export (which contains German
		# umlauts) is decoded the same way regardless of the platform default.
		with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
		    data = json.load(file)

		# NOTE(review): `k` is not used anywhere in this cell; kept in case a
		# later cell reads it -- confirm and remove.
		k = defaultdict(int)

		for feature in data["features"]:
		    # Only the tag dictionary of each feature is kept.
		    node = feature["properties"]
		    if node.get("amenity") == "restaurant":
		        restaurants.append(node)
		    # A node can be a restaurant and a hotel/attraction at the same time,
		    # so the tourism check is independent of the amenity check.
		    tourism = node.get("tourism")
		    if tourism == "hotel":
		        hotels.append(node)
		    elif tourism == "attraction":
		        attractions.append(node)


		# Keys that survive preprocessing. The OSM alias keys are listed too so
		# they are not filtered out before being folded into their canonical name.
		_DATA_KEYS = (
		    "name",
		    "description",
		    "addr:street",
		    "street",
		    "addr:housenumber",
		    "housenumber",
		    "addr:suburb",
		    "suburb",
		    "phone",
		    "contact:phone",
		    "website",
		    "contact:website",
		    "internet_access",
		    "smoking",
		    "wheelchair",
		    "delivery",
		    "vegan",
		    "diet:vegan",
		    "vegetarian",
		    "diet:vegetarian",
		    "indoor_seating",
		    "outdoor_seating",
		    "cuisine",
		    "price",
		    "type",
		    "edited",
		)

		# OSM alias key -> canonical key used in the output files.
		_KEY_ALIASES = {
		    "diet:vegan": "vegan",
		    "diet:vegetarian": "vegetarian",
		    "contact:phone": "phone",
		    "contact:website": "website",
		    "addr:street": "street",
		    "addr:housenumber": "housenumber",
		    "addr:suburb": "suburb",
		}

		# Cuisine flags written for every restaurant.
		_CUISINES = (
		    "african",
		    "american",
		    "asian",
		    "chinese",
		    "german",
		    "greek",
		    "indian",
		    "international",
		    "italian",
		    "japanese",
		    "korean",
		    "oriental",
		    "sushi",
		    "thai",
		)


		def _homogenize_data_point(label: str, data_point: dict) -> dict:
		    """Normalize one data point in place and return it as a key-sorted dict."""
		    # Filter out the unwanted properties
		    for key in list(data_point):
		        if key not in _DATA_KEYS:
		            del data_point[key]

		    # Fold alias keys into their canonical key; the alias value replaces any
		    # value already stored under the canonical key.
		    for alias, canonical in _KEY_ALIASES.items():
		        if alias in data_point:
		            data_point[canonical] = data_point.pop(alias)

		    # Restaurants get one boolean per known cuisine instead of the raw
		    # ";"-separated tag; cuisines outside the known set are ignored.
		    if label == "restaurants":
		        cuisine_flags = dict.fromkeys(_CUISINES, False)
		        if "cuisine" in data_point:
		            for cuisine in data_point["cuisine"].split(";"):
		                if cuisine in cuisine_flags:
		                    cuisine_flags[cuisine] = True
		        data_point["cuisine"] = cuisine_flags

		    # Put a "unknown" marker inside empty properties
		    for key in _DATA_KEYS:
		        if key in data_point or key in _KEY_ALIASES:
		            continue
		        if key == "type":
		            data_point["type"] = label.rstrip("s")
		        elif key == "edited":
		            data_point["edited"] = False
		        else:
		            data_point[key] = "unknown"

		    return dict(sorted(data_point.items(), key=lambda item: item[0]))


		def write_data(label: str, data: list):
		    """
		    Takes a list of data points of homogenous data type, homogenizes some data keys and removes superfluous keys.
		    It then writes the data split into three json files
		    (`../data/osm/{label}_1.json` to `../data/osm/{label}_3.json`), assigning
		    the data points to the files in round-robin order.

		    The data points in `data` are modified in place.

		    @input label: str
		    @input data: list

		    @output void
		    """
		    # Prepare to split output data to 3 files
		    buckets = ([], [], [])
		    for index, data_point in enumerate(data):
		        buckets[index % 3].append(_homogenize_data_point(label, data_point))

		    # ensure_ascii=False writes umlauts verbatim, so the files are opened
		    # as UTF-8 explicitly; the context manager closes them even on error.
		    for number, bucket in enumerate(buckets, start=1):
		        with open(f"../data/osm/{label}_{number}.json", "w", encoding="utf-8") as output:
		            output.write(json.dumps({"data": bucket}, indent=2, ensure_ascii=False))

		# Write preprocessed data to output files, one venue type after another
		for label, records in (
		    ("hotels", hotels),
		    ("restaurants", restaurants),
		    ("attractions", attractions),
		):
		    write_data(label, records)
		```

		%% Cell type:markdown id: tags:

		### Find Suburbs
		We use the official city map of Heidelberg to get the suburb for a specific address. The map can be found here: https://geoweb.heidelberg.de/geoportal/.

		The data we extracted from OpenStreetMap is used to get the address for each datapoint. Then we make a request to the heidelberg.de website and find the suburb for that address. After that we fill the suburb value in our json file.

		%% Cell type:code id: tags:

		``` python
		# Imports

		from selenium import webdriver
		from selenium.webdriver.common.keys import Keys
		from selenium.webdriver.common.by import By
		from selenium.webdriver.common.action_chains import ActionChains
		import time
		from webdriver_manager.chrome import ChromeDriverManager
		import json
		```

		%% Cell type:code id: tags:

		``` python
		def find_suburb_for_address(address):
		    """
		    Looks up the suburb of an address by driving a Chrome browser through
		    the Heidelberg geoportal (https://geoweb.heidelberg.de/geoportal/).

		    @input address: text typed into the geoportal search bar

		    @output str: the first known suburb name found in the result popup,
		                 or "unknown" if none of them appears
		    """
		    suburbs = [
		        "Altstadt", "Bahnstadt", "Bergheim", "Boxberg", "Emmertsgrund",
		        "Handschuhsheim", "Kirchheim", "Neuenheim", "Pfaffengrund", "Rohrbach",
		        "Schlierbach", "Südstadt", "Weststadt", "Wieblingen", "Ziegelhausen"
		    ]

		    # NOTE(review): passing the driver path positionally is the Selenium 3
		    # calling convention -- confirm against the installed Selenium version.
		    driver = webdriver.Chrome(ChromeDriverManager().install())

		    # Every lookup below can raise (e.g. element not found); the finally
		    # clause makes sure the browser is closed in that case as well.
		    try:
		        # Open the website
		        driver.get('https://geoweb.heidelberg.de/geoportal/')

		        # Wait for the page to load
		        time.sleep(6)

		        # Activate the checkbox for "Stadtteile"
		        checkbox_button = driver.find_element(By.XPATH, "//tr[@data-qtip='Stadtteile']//input[@type='button']")
		        checkbox_button.click()

		        # Wait a bit
		        time.sleep(2)

		        # Find the search bar, clear it, and enter an address
		        search_bar = driver.find_element(By.ID, 'searchserviceheidelberg-1060-inputEl')
		        search_bar.clear()
		        search_bar.send_keys(address)

		        # Wait for the list
		        time.sleep(2)

		        # Click on the first item in the list (hopefully there is just one)
		        first_item = driver.find_element(By.CSS_SELECTOR, '#boundlist-1121-listEl ul li')
		        first_item.click()

		        # Wait for results to load
		        time.sleep(2)

		        # Click on the center of the screen
		        window_size = driver.get_window_size()
		        center_x = window_size['width'] // 2
		        center_y = window_size['height'] // 2
		        ActionChains(driver).move_by_offset(center_x, center_y).click().perform()

		        # Wait for the popup bubble
		        time.sleep(4)

		        # Return the first suburb whose name shows up in a grid cell of the page
		        for suburb in suburbs:
		            suburb_elements = driver.find_elements(By.XPATH, f"//div[contains(@class, 'x-grid-cell-inner') and contains(text(), '{suburb}')]")
		            if suburb_elements:
		                return suburb
		        return "unknown"
		    finally:
		        # Close the browser
		        driver.quit()
		```

		%% Cell type:code id: tags:

		``` python
		# Read the JSON file
		with open('restaurants_1.json', 'r', encoding='utf-8') as file:
		    data = json.load(file)

		# Iterate over the first 20 entries and find the suburb for each.
		# The preprocessing step stores the address under "housenumber"/"street";
		# the raw OSM keys are only used as a fallback for unprocessed files.
		for entry in data["data"][:20]:
		    housenumber = entry["housenumber"] if "housenumber" in entry else entry["addr:housenumber"]
		    street = entry["street"] if "street" in entry else entry["addr:street"]
		    address = f"{housenumber} {street}"
		    suburb = find_suburb_for_address(address)
		    # Stored under the canonical key so the existing "suburb" value is
		    # replaced instead of a second, differently named key being added.
		    entry["suburb"] = suburb

		# Save the updated JSON data (umlauts are written verbatim, matching the
		# preprocessed input files)
		with open('test.json', 'w', encoding='utf-8') as file:
		    json.dump(data, file, indent=4, ensure_ascii=False)
		```