
Commit 7880528

Support for more complex pipelines (Closes stefan-grafberger#20) (PR stefan-grafberger#24)
* WIP: Creating complex example (×2)
* Minor linter fixes
* Added synthetic data
* Trying to get test pipeline to work: works with decision tree classifier, but not Keras NN
* Example works without w2v
* Complex example runs. TODO: mlinspect support for it
* Removed unnecessary statement
* Commented out parts of the complex example to make the pipeline run
* Added Join DAG node
* Fixed a bug in code_reference_to_module extraction
* Fixed a bug involving chained method calls
* Groupby DAG nodes work in progress
* Added end_lineno and end_col_offset to CodeReference to support chained function call edge cases (see the sketch after this list)
* WIP: fixing tests after code_reference change (×4)
* All tests work again after modifying code_reference
* Groupby-Aggregate DAG node with description
* Discovered a huge bug with subscript instrumentation
* Resulting changes from instrumentation bug fix
* DAG extraction works for projections with lists of columns as argument
* Found a way subscript assigns might be possible
* Index-Assign WIR extraction works
* Index-Assign runs, but no DAG extraction yet. TODO: add module info
* Index-Assign is doable but requires other operators to work
* Added some comments
* Select does not work yet but no longer throws errors
* Train-Test-Split no longer causes mlinspect to fail; still need to implement WIR extraction for tuple unpacking
* Added WIR support for tuple unpacking in assignments
* Imputer now in DAG
* Nested pipeline DAG creation starting to work. TODO: delete left-over original nodes after copying for column transformer
* Nested pipeline DAG extraction works
* Added support for W2V transformer
* DAG extraction works for the whole complex pipeline; only some runtime parts are missing that require analyzer instrumentation
* Fixed a test
* Subscript-Assign DAG nodes work completely now
* Projection/selection differentiation for df.__getitem__ almost works
* Select DAG node works now
* Finished selection changes and updated tests
* Added a TODO
* Adding analyzer support statement by statement: Data Source
* WIP: join (×2)
* Pandas backend now supports joins
* Pandas backend now supports groupby aggregates by treating them as a data source
* Pandas backend now supports the 2nd join (found a bug in the initial join implementation)
* Preparing set-label
* Pandas backend now supports the set-label syntax
* Fixed bug introduced in last commit
* Projection with double-list syntax already works
* Select by series works
* Train-test splits work, although we'll need to revisit them once we model the test set in the DAG
* Started with sklearn pipeline
* Analyzer support for a simple version of the sklearn pipeline
* Analyzer support for W2V transformer
* Analyzers work for the complex example; some code will need to be cleaned up though. TODO: write demo analyzers
* Moved demo into a notebook and a new directory
* Fixed bug with printed score
* Some cleanup. For some reason, due to moving demo_utils, the healthcare example is slower
* More cleanup (×2)
* Started with demo analyzer. Works: propagating age_group. TODO: histograms, embeddings
* Propagating age_group and race to calculate histograms
* First histogram plots work
* Histograms for Data Source and Groupby-Aggregate nodes if the column is available
* Added matplotlib to dependencies
* Included race histograms
* Added missing embedding inspection
* Updated demo notebook, added simple pipeline time measurement. Note: iterators are not yet shared between inspections, so each inspection slows everything down a bit
* Added a simple lineage inspection for demo purposes
* Renamed analyzers to inspections
* Improved package structure a bit
* Some cleanup
* More cleanup
* Changes from the CIDR version of the repo
* Disable demo notebook image output when running the corresponding pytest test
* Update README after renaming analyzers to inspections
1 parent f9adae0 commit 7880528
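
For readers skimming the CodeReference change called out above ("Added end_lineno and end_col_offset to CodeReference"): below is a minimal, hypothetical sketch of what such a source-span record might look like, assuming a plain dataclass carrying the four position fields named in the message. The actual definition in mlinspect's instrumentation code may differ.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class CodeReference:
    """Hypothetical sketch of the source span of an instrumented call.

    Assumption: adding end_lineno/end_col_offset lets chained calls on one
    line, e.g. data.groupby('age_group').agg(...), map to distinct,
    non-overlapping spans instead of colliding on (lineno, col_offset).
    """
    lineno: int
    col_offset: int
    end_lineno: int
    end_col_offset: int
```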

File tree

61 files changed: +5181 −1449 lines


README.md

Lines changed: 3 additions & 3 deletions

@@ -36,17 +36,17 @@ Prerequisite: python >= 3.8

Make it easy to analyze your pipeline and automatically check for common issues.

```python
from mlinspect.pipeline_inspector import PipelineInspector
-from mlinspect.instrumentation.analyzers.materialize_first_rows_analyzer import MaterializeFirstRowsAnalyzer
+from mlinspect.inspections.materialize_first_rows_inspection import MaterializeFirstRowsInspection

IPYNB_PATH = ...

inspection_result = PipelineInspector \
    .on_pipeline_from_ipynb_file(IPYNB_PATH) \
-    .add_analyzer(MaterializeFirstRowsAnalyzer(2)) \
+    .add_inspection(MaterializeFirstRowsInspection(2)) \
    .execute()

extracted_dag = inspection_result.dag
-analyzer_results = inspection_result.analyzer_to_annotations
+inspection_to_annotations = inspection_result.inspection_to_annotations
```

## Notes
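
As a hedged follow-up to the README snippet above: the attribute names `dag` and `inspection_to_annotations` come straight from the diff, while everything else below (the notebook path, iterating the mapping, printing the results) is an illustrative assumption about how the result might be consumed, not documented API behaviour.

```python
from mlinspect.pipeline_inspector import PipelineInspector
from mlinspect.inspections.materialize_first_rows_inspection import MaterializeFirstRowsInspection

IPYNB_PATH = "some_pipeline.ipynb"  # hypothetical path for illustration

inspection_result = PipelineInspector \
    .on_pipeline_from_ipynb_file(IPYNB_PATH) \
    .add_inspection(MaterializeFirstRowsInspection(2)) \
    .execute()

# Result fields renamed by this commit (formerly analyzer_to_annotations).
extracted_dag = inspection_result.dag
for inspection, annotations in inspection_result.inspection_to_annotations.items():
    # assumption: a mapping from each inspection to its per-node output
    print(inspection, annotations)
```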
File renamed without changes.
File renamed without changes.

demo/healthcare/demo_utils.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@

```python
"""
Some useful utils for the project
"""
import numpy
from sklearn.exceptions import NotFittedError
from gensim.sklearn_api import W2VTransformer
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.python.keras.optimizer_v2.gradient_descent import SGD


class MyW2VTransformer(W2VTransformer):
    """Some custom w2v transformer."""

    def partial_fit(self, X):
        # pylint: disable=useless-super-delegation
        super().partial_fit([X])

    def fit(self, X, y=None):
        X = X.iloc[:, 0].tolist()
        return super().fit([X], y)

    def transform(self, words):
        words = words.iloc[:, 0].tolist()
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # The input as array of array
        vectors = []
        for word in words:
            if word in self.gensim_model.wv:
                vectors.append(self.gensim_model.wv[word])
            else:
                vectors.append(numpy.zeros(self.size))
        return numpy.reshape(numpy.array(vectors), (len(words), self.size))


def create_model(input_dim):
    """Create a simple neural network"""
    clf = Sequential()
    clf.add(Dense(9, activation='relu', input_dim=input_dim))
    clf.add(Dense(9, activation='relu'))
    clf.add(Dense(2, activation='softmax'))
    clf.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=["accuracy"])
    return clf
```
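
A small, hedged usage sketch for `MyW2VTransformer` above: it assumes a single-column pandas DataFrame of strings (mirroring how healthcare.py routes the 'last_name' column through a ColumnTransformer), and the `size`/`min_count` values are illustrative only, not taken from the commit.

```python
import pandas as pd

from demo.healthcare.demo_utils import MyW2VTransformer

# Illustrative single-column input; the transformer takes column 0 of the
# DataFrame and treats it as one list of tokens.
names = pd.DataFrame({'last_name': ['smith', 'jones', 'smith', 'doe']})

w2v = MyW2VTransformer(size=5, min_count=1)  # hypothetical hyperparameters
w2v.fit(names)
vectors = w2v.transform(names)
print(vectors.shape)  # (4, 5); out-of-vocabulary names map to zero vectors
```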

demo/healthcare/healthcare.png

353 KB

demo/healthcare/healthcare.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@

```python
"""
An example pipeline
"""
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from demo.healthcare.demo_utils import MyW2VTransformer, create_model
from mlinspect.utils import get_project_root

COUNTIES_OF_INTEREST = ['county2', 'county3']

# load input data sources (data generated with https://www.mockaroo.com as a single file and then split into two)
patients = pd.read_csv(os.path.join(str(get_project_root()), "demo", "healthcare", "healthcare_patients.csv"), na_values='?')
histories = pd.read_csv(os.path.join(str(get_project_root()), "demo", "healthcare", "healthcare_histories.csv"),
                        na_values='?')

# combine input data into a single table
data = patients.merge(histories, on=['ssn'])

# compute mean complications per age group, append as column
complications = data.groupby('age_group').agg(mean_complications=('complications', 'mean'))

data = data.merge(complications, on=['age_group'])

# target variable: people with a high number of complications
data['label'] = data['complications'] > 1.2 * data['mean_complications']

# project data to a subset of attributes
data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]

# filter data
data = data[data['county'].isin(COUNTIES_OF_INTEREST)]

# define the feature encoding of the data
impute_and_one_hot_encode = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

featurisation = ColumnTransformer(transformers=[
    ("impute_and_one_hot_encode", impute_and_one_hot_encode, ['smoker', 'county', 'race']),
    ('word2vec', MyW2VTransformer(min_count=2), ['last_name']),
    ('numeric', StandardScaler(), ['num_children', 'income'])
])

# define the training pipeline for the model
neural_net = KerasClassifier(build_fn=create_model, epochs=10, batch_size=1, verbose=0, input_dim=109)
pipeline = Pipeline([
    ('features', featurisation),
    ('learner', neural_net)])

# train-test split
train_data, test_data = train_test_split(data, random_state=0)
# model training
model = pipeline.fit(train_data, train_data['label'])
# model evaluation
print(model.score(test_data, test_data['label']))
```
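
To tie the new demo back to the commit's main point (mlinspect can now extract a DAG and run inspections over this more complex pipeline), here is a hedged sketch that reuses only the entry points shown in the README diff above; the notebook path exists in this commit, but the exact inspection set and output shape are assumptions.

```python
import os

from mlinspect.utils import get_project_root
from mlinspect.pipeline_inspector import PipelineInspector
from mlinspect.inspections.materialize_first_rows_inspection import MaterializeFirstRowsInspection

# Run mlinspect over the healthcare demo notebook added in this commit.
IPYNB_PATH = os.path.join(str(get_project_root()), "demo", "healthcare", "healthcare_demo.ipynb")

inspection_result = PipelineInspector \
    .on_pipeline_from_ipynb_file(IPYNB_PATH) \
    .add_inspection(MaterializeFirstRowsInspection(2)) \
    .execute()

# The extracted DAG should now include the operators this commit adds support
# for: joins, groupby-aggregate, projections, selections, train-test split,
# the nested sklearn pipeline, and the W2V transformer.
extracted_dag = inspection_result.dag
print(extracted_dag)
```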

demo/healthcare/healthcare_demo.ipynb

Lines changed: 571 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments
