Skip to content

Commit

Permalink
feat(SemanticLayerSchema): making column type optional and avoid dump…
Browse files Browse the repository at this point in the history
…ing and parsing null values in the SemanticLayerSchema
  • Loading branch information
scaliseraoul committed Jan 15, 2025
1 parent d370d17 commit f889c86
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/v3/contributing.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ make spell_fix
We use `pytest` to test our code. You can run the tests by running the following command:

```bash
make tests
make test_all
```

Make sure that all tests pass before submitting a pull request.
Expand Down
35 changes: 22 additions & 13 deletions docs/v3/semantic-layer.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,22 @@ df.save(
path="company/sales-data", # Format: "organization/dataset"
name="sales-data", # Human-readable name
description="Sales data from our retail stores", # Optional description
columns={
"transaction_id": {
columns=[
{
"name": "transaction_id",
"type": "string",
"description": "Unique identifier for each sale"
},
"sale_date": {
{
        "name": "sale_date",
"type": "datetime",
"description": "Date and time of the sale"
}
}
]
)
```

#### name
#### name

The name field identifies your dataset in the save method.

Expand Down Expand Up @@ -84,28 +86,33 @@ df.save(
path="company/sales-data",
name="sales-data",
description="Daily sales transactions from all retail stores",
columns={
"transaction_id": {
columns=[
{
"name": "transaction_id",
"type": "string",
"description": "Unique identifier for each sale"
},
"sale_date": {
{
        "name": "sale_date",
"type": "datetime",
"description": "Date and time of the sale"
},
"quantity": {
{
"name": "quantity",
"type": "integer",
"description": "Number of units sold"
},
"price": {
{
"name": "price",
"type": "float",
"description": "Price per unit in USD"
},
"is_online": {
{
"name": "is_online",
"type": "boolean",
"description": "Whether the sale was made online"
}
}
]
)
```

Expand Down Expand Up @@ -238,7 +245,7 @@ source:
- `connection_string` (str): Connection string for the data source
- `query` (str): Query to retrieve data from the data source


{/* commented as destination and update frequency will be only in the materialized case
#### destination (mandatory)
Specify the destination for your dataset.

Expand All @@ -256,6 +263,7 @@ destination:
path: /path/to/data
```


#### update_frequency
Specify the frequency of updates for your dataset.

Expand All @@ -268,6 +276,7 @@ Specify the frequency of updates for your dataset.
```yaml
update_frequency: daily
```
*/}

#### order_by
Specify the columns to order by.
Expand Down
2 changes: 1 addition & 1 deletion pandasai/data_loader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _get_abs_dataset_path(self):
return os.path.join(find_project_root(), "datasets", self.dataset_path)

def _load_schema(self):
schema_path = os.path.join(self._get_abs_dataset_path(), "schema.yaml")
schema_path = os.path.join(str(self._get_abs_dataset_path()), "schema.yaml")
if not os.path.exists(schema_path):
raise FileNotFoundError(f"Schema file not found: {schema_path}")

Expand Down
19 changes: 12 additions & 7 deletions pandasai/data_loader/semantic_layer_schema.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import json
from typing import Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union

import yaml
from pydantic import (
Expand All @@ -19,21 +19,23 @@

class Column(BaseModel):
name: str = Field(..., description="Name of the column.")
type: str = Field(..., description="Data type of the column.")
type: Optional[str] = Field(None, description="Data type of the column.")
description: Optional[str] = Field(None, description="Description of the column")

@field_validator("type")
@classmethod
def is_column_type_supported(cls, type: str) -> str:
if type not in VALID_COLUMN_TYPES:
raise ValueError(f"Unsupported column type: {type}")
raise ValueError(
f"Unsupported column type: {type}. Supported types are: {VALID_COLUMN_TYPES}"
)
return type


class Transformation(BaseModel):
type: str = Field(..., description="Type of transformation to be applied.")
params: Dict[str, str] = Field(
..., description="Parameters for the transformation."
params: Optional[Dict[str, str]] = Field(
None, description="Parameters for the transformation."
)

@field_validator("type")
Expand Down Expand Up @@ -95,13 +97,13 @@ def is_format_supported(cls, format: str) -> str:

class SemanticLayerSchema(BaseModel):
name: str = Field(..., description="Dataset name.")
source: Source = Field(..., description="Data source for your dataset.")
description: Optional[str] = Field(
None, description="Dataset’s contents and purpose description."
)
columns: Optional[List[Column]] = Field(
None, description="Structure and metadata of your dataset’s columns"
)
source: Source = Field(..., description="Data source for your dataset.")
order_by: Optional[List[str]] = Field(
None, description="Ordering criteria for the dataset."
)
Expand All @@ -118,8 +120,11 @@ class SemanticLayerSchema(BaseModel):
None, description="Frequency of dataset updates."
)

def to_dict(self) -> dict[str, Any]:
return self.model_dump(exclude_none=True)

def to_yaml(self) -> str:
return yaml.dump(self.model_dump(), sort_keys=False)
return yaml.dump(self.to_dict(), sort_keys=False)


def is_schema_source_same(
Expand Down
9 changes: 5 additions & 4 deletions pandasai/dataframe/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,22 +176,23 @@ def _create_yml_template(
columns_dict: dictionary with info about columns of the dataframe
"""

columns = list(map(lambda column: Column(**column), columns_dict))
if columns_dict:
columns_dict = list(map(lambda column: Column(**column), columns_dict))

schema = SemanticLayerSchema(
name=name,
description=description,
columns=columns,
columns=columns_dict,
source=Source(type="parquet", path="data.parquet"),
destination=Destination(
type="local", format="parquet", path="data.parquet"
),
)

return schema.model_dump()
return schema.to_dict()

def save(
self, path: str, name: str, description: str = None, columns: List[dict] = []
self, path: str, name: str, description: str = None, columns: List[dict] = None
):
self.name = name
self.description = description
Expand Down
10 changes: 2 additions & 8 deletions tests/unit_tests/dataframe/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,24 +112,18 @@ def test_save_creates_correct_schema(self, sample_df):
"name": name,
"description": description,
"columns": [
{"name": "Name", "type": "string", "description": None},
{"name": "Age", "type": "integer", "description": None},
{"name": "Name", "type": "string"},
{"name": "Age", "type": "integer"},
],
"destination": {
"format": "parquet",
"path": "data.parquet",
"type": "local",
},
"limit": None,
"order_by": None,
"source": {
"connection": None,
"path": "data.parquet",
"table": None,
"type": "parquet",
},
"transformations": None,
"update_frequency": None,
}

mock_yaml_dump.assert_called_once_with(
Expand Down

0 comments on commit f889c86

Please sign in to comment.