Generalizes preprocessing to all data types (d66a6412) · Commits · hillengass / SynDRA

notebooks/knowledge_base.ipynb

+109 −102

Original line number	Diff line number	Diff line
		%% Cell type:markdown id: tags:

		# HeidelKBerg

		A knowledge base for retrieval-assisted generation of task-oriented dialogues.

		%% Cell type:markdown id: tags:

		## Data

		We get the data we want from OpenStreetMap via the [Overpass API](https://overpass-turbo.eu) with this query:

		```sql
		[out:json][timeout:25];
		{{geocodeArea:Heidelberg}}->.searchArea;
		(
		nwr["amenity"="restaurant"](area.searchArea);
		nwr["amenity"="museum"](area.searchArea);
		nwr["tourism"="hotel"](area.searchArea);
		nwr["tourism"="attraction"](area.searchArea);
		);
		out geom;
		```

		and export it to `@/data/osm/osm_export.geojson`.

		%% Cell type:markdown id: tags:

		## Schema

		We define the schema of our knowledge base with pydantic types.

		%% Cell type:code id: tags:

		``` python
		from typing import List, Literal
		from pydantic import BaseModel, computed_field


		class Address(BaseModel):
		    # Postal address of a venue inside Heidelberg.
		    street: str
		    housenumber: str
		    # suburb list from https://www.heidelberg.de/hd/HD/Leben/Stadtteile.html
		    suburb: Literal[
		        "Altstadt", "Bahnstadt", "Bergheim", "Boxberg", "Emmertsgrund",
		        "Handschuhsheim", "Kirchheim", "Neuenheim", "Pfaffengrund",
		        "Rohrbach", "Schlierbach", "Südstadt", "Weststadt", "Wieblingen",
		        "Ziegelhausen"
		    ]

		    # Every type that carries an address also needs an area, so the area is
		    # derived here once instead of being duplicated in each venue type,
		    # even though it is not an address property in the strict sense.
		    @computed_field
		    @property
		    def area(self) -> Literal["center", "east", "north", "south", "west"]:
		        # Suburbs grouped by the coarse city area they belong to.
		        suburbs_by_area = {
		            "center": ("Altstadt", "Bahnstadt", "Bergheim", "Weststadt"),
		            "east": ("Schlierbach", "Ziegelhausen"),
		            "north": ("Handschuhsheim", "Neuenheim"),
		            "south": (
		                "Boxberg",
		                "Emmertsgrund",
		                "Kirchheim",
		                "Rohrbach",
		                "Südstadt",
		            ),
		            "west": ("Pfaffengrund", "Wieblingen"),
		        }
		        # Invert the grouping into a suburb -> area lookup; an unexpected
		        # suburb raises KeyError, exactly like a direct dict lookup.
		        area_of_suburb = {
		            suburb: region
		            for region, members in suburbs_by_area.items()
		            for suburb in members
		        }
		        return area_of_suburb[self.suburb]


		# Common base model of the knowledge base: the fields that Hotel,
		# Restaurant and Attraction all inherit.
		class Venue(BaseModel):
		name: str
		description: str
		# Nested Address model; it also exposes the computed `area` of the venue.
		address: Address


		# Knowledge-base entry for a hotel: the Venue fields plus contact data,
		# a price class and a set of amenity flags.
		class Hotel(Venue):
		phone: str
		website: str
		price: Literal["cheap", "moderate", "expensive"]
		# Star rating is a string literal so that "unknown" fits the same field.
		stars: Literal["0", "1", "2", "3", "4", "5", "unknown"]
		# Amenity flags are tri-state strings rather than booleans; "unknown"
		# marks a value that is missing in the source data.
		air_conditioning: Literal["yes", "no", "unknown"]
		internet_access: Literal["yes", "no", "unknown"]
		parking: Literal["yes", "no", "unknown"]
		smoking: Literal["yes", "no", "unknown"]
		# NOTE(review): Restaurant.wheelchair also mentions "limited"; confirm
		# whether hotels should accept that value as well.
		wheelchair: Literal["yes", "no", "unknown"]


		class Restaurant(Venue):
		    # Knowledge-base entry for a restaurant: the Venue fields plus contact
		    # data, a price class, amenity/diet flags and the served cuisines.
		    phone: str
		    website: str
		    price: Literal["cheap", "moderate", "expensive"]
		    # Flags are strings rather than booleans; "unknown" marks a value that
		    # is missing in the source data.
		    internet_access: Literal["yes", "no", "unknown"]
		    smoking: Literal["yes", "no", "unknown"]
		    # The comma between "limited" and "unknown" is required: without it
		    # Python concatenates the adjacent literals into the single value
		    # "limitedunknown" and both real values fail validation.
		    wheelchair: Literal["yes", "no", "limited", "unknown"]
		    bar: Literal["yes", "no", "unknown"]
		    delivery: Literal["yes", "no", "unknown"]
		    vegan: Literal["yes", "no", "limited", "unknown"]
		    vegetarian: Literal["yes", "no", "limited", "unknown"]
		    indoor_seating: Literal["yes", "no", "limited", "unknown"]
		    outdoor_seating: Literal["yes", "no", "limited", "unknown"]
		    # A restaurant may serve several cuisines, hence a list of literals.
		    cuisine: List[Literal[
		        "african",
		        "american",
		        "asian",
		        "chinese",
		        "german",
		        "greek",
		        "indian",
		        "international",
		        "italian",
		        "japanese",
		        "korean",
		        "oriental",
		        "sushi",
		        "thai",
		    ]]


		# Knowledge-base entry for a tourist attraction: the Venue fields plus
		# one or more category tags.
		class Attraction(Venue):
		# An attraction may carry several categories, hence a list of literals.
		attraction_type: List[Literal[
		"zoo",
		"architecture",
		"view",
		"historic",
		]]


		#class Museum(Attraction):
		# theme: str


		#class Theatre(Attraction):
		# program: List[Literal[
		# "Monday",
		# "Tuesday",
		# "Wednesday",
		# "Thursday",
		# "Friday",
		# "Saturday",
		# "Sunday",
		# ]]


		#class TouristAttraction(Attraction):
		# view: Literal["yes", "no", "unknown"]
		```

		%% Cell type:markdown id: tags:

		We import our data from `osm_export.geojson` and split it into restaurants, hotels and attractions for further processing.
		Each type is written to three output files so that the data can easily be divided among three manual annotators.

		%% Cell type:code id: tags:

		``` python
		import json
		import ndjson
		from collections import defaultdict


		# Raw OSM property dicts, one list per venue type
		restaurants = []
		hotels = []
		attractions = []

		# Load data and split into types.
		# The encoding is given explicitly because JSON text is UTF-8 and the
		# default of open() depends on the platform locale, which can garble
		# names containing umlauts.
		with open("../data/osm/osm_export.geojson", encoding="utf-8") as file:
		    data = json.load(file)

		# NOTE(review): `k` is not used in this cell; it is kept in case another
		# cell relies on it - confirm and remove.
		k = defaultdict(int)

		for feature in data["features"]:
		    # Only the tag dictionary of a feature is kept; geometry is dropped.
		    node = feature["properties"]

		    # "amenity" and "tourism" are checked independently, so a feature
		    # tagged with both ends up in both lists.
		    if node.get("amenity") == "restaurant":
		        restaurants.append(node)

		    tourism = node.get("tourism")
		    if tourism == "hotel":
		        hotels.append(node)
		    elif tourism == "attraction":
		        attractions.append(node)
		```

		%% Cell type:markdown id: tags:

		## Restaurants

		Here we are processing the restaurants. We start by declaring our desired keys.
		Then we remove all unnecessary keys and fill missing keys with a marker.

		%% Cell type:code id: tags:

		``` python
		# NOTE(review): this restaurant-only preprocessing appears to be the
		# removed side of the diff; the same steps reappear inside write_data().
		# The fragment is cut off inside the `superflous_labels` list below.

		# Properties that are kept for every restaurant; both the plain and the
		# prefixed ("contact:", "diet:") spellings are listed.
		restaurant_keys = [
		"name",
		"description",
		"addr:street",
		"addr:housenumber",
		"addr:suburb",
		"phone",
		"contact:phone",
		"website",
		"contact:website",
		"internet_access",
		"smoking",
		"wheelchair",
		"delivery",
		"vegan",
		"diet:vegan",
		"vegetarian",
		"diet:vegetarian",
		"indoor_seating",
		"outdoor_seating",
		"cuisine",
		"price",
		]

		# Set up 3 output lists so that manual editing can be split three ways
		of0 = []
		of1 = []
		of2 = []

		# Iterating over restaurants; `i` cycles 0, 1, 2 to pick the output list
		i = 0
		for r in restaurants:
		# Snapshot of the keys, so that `r` can be modified while looping
		rk = list(r.keys())

		# Filter out the unwanted properties
		for key in rk:
		if key not in restaurant_keys:
		r.pop(key)

		# Homogenize labels: move prefixed spellings onto the plain label
		if "diet:vegan" in rk:
		r["vegan"] = r["diet:vegan"]
		r.pop("diet:vegan")
		if "diet:vegetarian" in rk:
		r["vegetarian"] = r["diet:vegetarian"]
		r.pop("diet:vegetarian")
		# NOTE(review): "conact:phone" looks like a typo for "contact:phone";
		# as written the test only matches a key spelled exactly that way.
		if "conact:phone" in rk:
		r["phone"] = r["contact:phone"]
		r.pop("contact:phone")
		# NOTE(review): same "conact:" spelling here, and the website value is
		# stored under "phone" - presumably "website" was intended; confirm.
		if "conact:website" in rk:
		r["phone"] = r["contact:website"]
		r.pop("contact:website")

		# Prefixed spellings that must not be filled with "unknown"
		superflous_labels = [
		"diet:vegan",
		"diet:vegetarian",
		def write_data(label: str, data: list):
		    """
		    Takes a list of data points of homogeneous data type, removes
		    superfluous keys, homogenizes alternative key spellings and fills
		    missing keys with the marker "unknown".
		    It then writes the data, split round-robin, into the three JSON files
		    `../data/osm/{label}_1.json`, `_2.json` and `_3.json`, each shaped
		    like `{"data": [...]}` with alphabetically sorted keys per data point.

		    The data points are modified in place.

		    @input label: str   prefix of the output file names
		    @input data: list   list of property dictionaries

		    @output void
		    """

		    # Define possible labels
		    data_keys = [
		        "name",
		        "description",
		        "addr:street",
		        "addr:housenumber",
		        "addr:suburb",
		        "phone",
		        "contact:phone",
		        "website",
		        "contact:website",
		        "internet_access",
		        "smoking",
		        "wheelchair",
		        "delivery",
		        "vegan",
		        "diet:vegan",
		        "vegetarian",
		        "diet:vegetarian",
		        "indoor_seating",
		        "outdoor_seating",
		        "cuisine",
		        "price",
		    ]

		    # Alternative spellings and the label they are merged into. These
		    # keys never appear in the output and are never filled with "unknown".
		    superflous_labels = {
		        "diet:vegan": "vegan",
		        "diet:vegetarian": "vegetarian",
		        "contact:phone": "phone",
		        "contact:website": "website",
		    }

		    # Prepare to split output data to 3 files
		    splits = [[], [], []]

		    # Iterating over data points
		    for index, data_point in enumerate(data):
		        # Filter out the unwanted properties; iterate over a snapshot of
		        # the keys because the dictionary shrinks while looping
		        for key in list(data_point):
		            if key not in data_keys:
		                data_point.pop(key)

		        # Homogenize labels: the alternative spelling overwrites the
		        # plain label and is removed afterwards
		        for alternative, label_name in superflous_labels.items():
		            if alternative in data_point:
		                data_point[label_name] = data_point.pop(alternative)

		        # Put a "unknown" marker inside empty properties
		        for key in data_keys:
		            if key not in superflous_labels:
		                data_point.setdefault(key, "unknown")

		        # Distribute round-robin; keys are sorted for stable, diffable files
		        splits[index % 3].append(dict(sorted(data_point.items())))

		    # Write each split to its own file. UTF-8 is requested explicitly
		    # because ensure_ascii=False emits non-ASCII characters verbatim.
		    for number, split in enumerate(splits, start=1):
		        path = f"../data/osm/{label}_{number}.json"
		        with open(path, "w", encoding="utf-8") as output:
		            output.write(
		                json.dumps({"data": split}, indent=2, ensure_ascii=False)
		            )

		# Preprocess every venue type and write its three annotation files
		for label, data_points in (
		    ("hotels", hotels),
		    ("restaurants", restaurants),
		    ("attractions", attractions),
		):
		    write_data(label, data_points)
		```