VisActor · da730 · Feb 21, 2024 · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024
diff --git a/packages/vmind/__tests__/browser/src/constants/mockData.ts b/packages/vmind/__tests__/browser/src/constants/mockData.ts
@@ -3625,15 +3625,14 @@ Xiaomi,0.1,4087,131345
  * 雷达图
  */
 export const mockUserInput14 = {
-  csv: `key,value
+  csv: `dimension,performance
 Strength,5
 Speed,5
 Shooting,3
 Endurance,5
 Precision,5
-Growth,5
-  `,
-  input: '帮我展示个人在不同方面的绩效，他是否是六边形战士'
+Growth,5`,
+  input: '帮我展示这个人在不同方面的绩效，他是否是六边形战士'
 };
 
 /**

diff --git a/packages/vmind/bundler.config.js b/packages/vmind/bundler.config.js
@@ -4,11 +4,10 @@
 const json = require('@rollup/plugin-json');
 
 module.exports = {
-  formats: ['cjs', 'es', 'umd'],
+  formats: ['cjs', 'es'],
   outputDir: {
     es: 'esm',
     cjs: 'cjs',
-    umd: 'build'
   },
   name: 'VMind',
   umdOutputFilename: 'index',

diff --git a/packages/vmind/src/chart-to-video/index.ts b/packages/vmind/src/chart-to-video/index.ts
@@ -7,7 +7,8 @@ export async function _chatToVideoWasm(
   propsSpec: any,
   propsTime: TimeType,
   outName = 'out',
-  outerPackages: OuterPackages
+  outerPackages: OuterPackages,
+  mode?: 'node' | 'desktop-browser'
 ) {
   const { ManualTicker, defaultTimeline, VChart, fetchFile, FFmpeg, createCanvas } = outerPackages;
 
@@ -31,7 +32,7 @@ export async function _chatToVideoWasm(
   const canvas = createCanvas(width, height);
   const vchart = new VChart(spec, {
     renderCanvas: canvas,
-    mode: 'desktop-browser',
+    mode: 'node',
     dpr: 1,
     disableDirtyBounds: true,
     ticker: defaultTicker,
@@ -74,28 +75,33 @@ export async function _chatToVideoWasm(
     vchart.getStage().render();
     const num = `0000${i}`.slice(-3);
 
-    const size = { width: canvas.width, height: canvas.height };
-    const blob = await new Promise((resolve, reject) => {
-      canvas.toBlob((blob: any) => {
-        if (blob) {
-          const info = {
-            data: blob,
-            format: 'PNG',
-            size
-          };
-          console.log(`BBB--------${info}`);
-          resolve(info);
-        } else {
-          console.log('no blob');
-          reject('no blob');
-        }
-      }, `image/png`);
-    });
+    if (mode === 'node') {
+      // node: write the bytes returned by the canvas's toBuffer() as this frame (assumes a node-canvas-style canvas — TODO confirm)
+      FFmpeg.FS('writeFile', `vchart${idx}.${num}.png`, (canvas as any).toBuffer());
+    } else {
+      // otherwise: encode the frame through canvas.toBlob and convert it with fetchFile before writing
+      const size = { width: canvas.width, height: canvas.height };
+      const blob = await new Promise((resolve, reject) => {
+        canvas.toBlob((blob: any) => {
+          if (blob) {
+            // resolve with the blob together with its format and the canvas size
+            resolve({
+              data: blob,
+              format: 'PNG',
+              size
+            });
+          } else {
+            // toBlob handed back no blob, so there is no frame to write
+            reject('no blob');
+          }
+        }, `image/png`);
+      });
+      FFmpeg.FS('writeFile', `vchart${idx}.${num}.png`, await fetchFile((blob as any).data));
+    }
 
     // defaultTicker.mode = 'raf'
     // const imageData = ctx.getImageData(0, 0, ctx.canvas.width, ctx.canvas.height);
     // console.log(new Uint8Array(imageData.data.buffer))
-    FFmpeg.FS('writeFile', `vchart${idx}.${num}.png`, await fetchFile((blob as any).data));
   }
 
   vchart.release();

diff --git a/packages/vmind/src/common/dataProcess/index.ts b/packages/vmind/src/common/dataProcess/index.ts
@@ -15,8 +15,6 @@ export const parseCSVWithVChart = (csvString: string) => {
   return dataView;
 };
 
-export const getDataView = (dataset: DataItem[]) => {};
-
 export const getDataset = (csvString: string): { dataset: DataItem[]; columns: string[] } => {
   //get dataset from csv string
   const dataView = parseCSVWithVChart(csvString);

diff --git a/packages/vmind/src/common/dataProcess/utils.ts b/packages/vmind/src/common/dataProcess/utils.ts
@@ -1,6 +1,7 @@
 import { sampleSize, isNumber, isInteger } from 'lodash';
 import { DataItem, DataType, ROLE, SimpleFieldInfo } from '../../typings';
 import dayjs from 'dayjs';
+import { uniqArray } from '@visactor/vutils';
 export const readTopNLine = (csvFile: string, n: number) => {
   // get top n lines of a csv file
   let res = '';
@@ -29,13 +30,18 @@ function validateDate(date: any) {
   return dayjs(date, 'YYYY-MM-DD').isValid() || dayjs(date, 'MM-DD').isValid();
 }
 
+export function removeEmptyLines(str: string) {
+  return str.split(/\n\s*\n/).join('\n'); // each newline + optional whitespace + newline run becomes a single newline
+}
+
 export const detectFieldType = (dataset: DataItem[], column: string): SimpleFieldInfo => {
   let fieldType: DataType | undefined = undefined;
   //detect field type based on rules
   //The data types have the following inclusion relationships:
   //date=>string
   //int=>float=>string
   //detect field type from strict to loose
+
   dataset.every(data => {
     const value = data[column];
     const numberValue = Number(value);
@@ -88,10 +94,19 @@ export const detectFieldType = (dataset: DataItem[], column: string): SimpleFiel
       return true;
     }
   });
+  const role = [DataType.STRING, DataType.DATE].includes(fieldType) ? ROLE.DIMENSION : ROLE.MEASURE;
+  // Dimensions keep the raw cell values; measures are coerced with Number(), so a non-numeric cell becomes NaN.
+  // The [min, max] range is folded with reduce(): spreading a large column into Math.min/Math.max can exceed the argument limit.
+  const domain: (string | number)[] = dataset.map(d => (role === ROLE.DIMENSION ? d[column] : Number(d[column])));
+
   return {
     fieldName: column,
     type: fieldType,
-    role: [DataType.STRING, DataType.DATE].includes(fieldType) ? ROLE.DIMENSION : ROLE.MEASURE
+    role,
+    domain:
+      role === ROLE.DIMENSION
+        ? (uniqArray(domain) as string[]).slice(0, 20)
+        : (domain as number[]).reduce(([lo, hi], v) => [Math.min(lo, v), Math.max(hi, v)], [Infinity, -Infinity])
   };
 };
 export const getFieldInfoFromDataset = (dataset: DataItem[], columns: string[]): SimpleFieldInfo[] => {

diff --git a/packages/vmind/src/core/VMind.ts b/packages/vmind/src/core/VMind.ts
@@ -96,18 +96,18 @@ class VMind {
     return { fieldInfo: [], dataset };
   }
 
-  async exportVideo(spec: any, time: TimeType, outerPackages: OuterPackages) {
+  async exportVideo(spec: any, time: TimeType, outerPackages: OuterPackages, mode?: 'node' | 'desktop-browser') {
     const { VChart, FFmpeg, fetchFile, ManualTicker } = outerPackages;
     const outName = `out`;
-    await _chatToVideoWasm(this._FPS, spec, time, outName, outerPackages);
+    await _chatToVideoWasm(this._FPS, spec, time, outName, outerPackages, mode);
     const data = FFmpeg.FS('readFile', `${outName}.mp4`);
     return data.buffer;
   }
 
-  async exportGIF(spec: any, time: TimeType, outerPackages: OuterPackages) {
+  async exportGIF(spec: any, time: TimeType, outerPackages: OuterPackages, mode?: 'node' | 'desktop-browser') {
     const { VChart, FFmpeg, fetchFile } = outerPackages;
     const outName = `out`;
-    await _chatToVideoWasm(this._FPS, spec, time, outName, outerPackages);
+    await _chatToVideoWasm(this._FPS, spec, time, outName, outerPackages, mode);
     // 调色板
     await FFmpeg.run('-i', `${outName}.mp4`, '-filter_complex', '[0:v] palettegen', 'palette.png');
     await FFmpeg.run(

diff --git a/packages/vmind/src/gpt/chart-generation/utils.ts b/packages/vmind/src/gpt/chart-generation/utils.ts
@@ -39,6 +39,16 @@ export const patchChartTypeAndCell = (chartTypeOutter: string, cell: any, datase
   const { x, y } = cell;
 
   let chartType = chartTypeOutter;
+
+  // use the "axis" field as a fallback: fill a missing x first, otherwise fill a missing y
+  if (cell.axis && (!cell.x || !cell.y)) {
+    if (!cell.x) {
+      cell.x = cell.axis;
+    } else if (!cell.y) {
+      cell.y = cell.axis;
+    }
+  }
+
   // y轴字段有多个时，处理方式:
   // 1. 图表类型为: 箱型图, 图表类型不做矫正
   // 2. 图表类型为: 柱状图 或 折线图, 图表类型矫正为双轴图
@@ -77,13 +87,20 @@ export const patchChartTypeAndCell = (chartTypeOutter: string, cell: any, datase
       const {
         lower_whisker,
         lowerWhisker,
-        lowerBox,
         min,
         lower,
+        lowerBox,
+        lower_box,
         q1,
+        lower_quartile,
+        lowerQuartile,
+        midline,
         median,
         q3,
         upperBox,
+        upper_box,
+        upper_quartile,
+        upperQuartile,
         upper_whisker,
         upperWhisker,
         max,
@@ -98,15 +115,22 @@ export const patchChartTypeAndCell = (chartTypeOutter: string, cell: any, datase
             lowerWhisker,
             min,
             lower,
-            q1,
             lowerBox,
+            lower_box,
+            q1,
+            lower_quartile,
+            lowerQuartile,
+            midline,
             median,
-            upperBox,
             q3,
+            upperBox,
+            upper_box,
+            upper_quartile,
+            upperQuartile,
             upper_whisker,
+            upperWhisker,
             max,
-            upper,
-            upperWhisker
+            upper
           ].filter(Boolean)
         }
       };

diff --git a/packages/vmind/src/gpt/dataProcess/prompts.ts b/packages/vmind/src/gpt/dataProcess/prompts.ts
@@ -195,78 +195,51 @@ Response:
 
 export const getQueryDatasetPrompt = (
   showThoughts: boolean
-) => `You are an expert in data analysis. Here is a raw dataset named dataSource. User will tell you his command and column information of DataSource. You need to generate a standard SQL query to select useful fields from dataSource according to the template following the Steps and Description. Return the JSON object only.
-# Note
-1. You are running on a simple SQL engine, so the advanced features, such as RANK() OVER, TOP, JOIN, UNION, etc., are not supported. Please follow the SQL template and Description strictly.
-2. Don't guess the specific data content in your SQL. Don't use conditional statement.
-3. If you think the fields in dataSource cannot meet user requirements, do not further generate new fields. Just ignore user's command and use these fields.
+) => `You are an expert in data analysis. Here is a raw dataset named dataSource. User will tell you his command and column information of dataSource. Your task is to generate SimQuery and fieldInfo according to SimQuery Instruction. Respond with one JSON object only.
 
+# SimQuery Instruction
+- SimQuery is a simplified SQL-like language. Supported keywords in SimQuery: ["SELECT", "FROM", "WHERE", "GROUP BY", "HAVING", "ORDER BY", "LIMIT"].
+- A SimQuery query looks like this: "SELECT columnA, SUM(columnB) as sum_b FROM dataSource WHERE columnA = value1 GROUP BY columnA HAVING sum_b>0 ORDER BY sum_b LIMIT 10".
+- Columns in SELECT can only be original columns or aggregated columns. Supported aggregation methods in SimQuery: ["MAX()", "MIN()", "SUM()", "COUNT()", "AVG()"].
+- The "WHERE" and "HAVING" in SimQuery can only use original columns or aggregated columns in dataSource. Supported Operators in SimQuery:[ ">", ">=", "<", "<=", "=", "!=", "in", "not in", "is null", "is not null", "between", "not between", "like", "not like"]. Don't use non-existent columns.
+- Don't use unsupported keywords such as CASE WHEN...ELSE...END or PERCENTILE_CONT. Don't use unsupported aggregation methods on columns. Don't use unsupported operators. Unsupported keywords, methods and operators will cause a system crash. If current keywords and methods can't meet your needs, just simply select the column without any processing.
+- Make your SimQuery as simple as possible.
 
-# SQL template:
-SELECT xxx FROM xxx (WHERE xxx) GROUP BY xxx (HAVING xxx) (ORDER BY xxx) (LIMIT xxx).
-
+You need to follow the steps below.
 
 # Steps
-1. Just use user's command to select useful fields directly. Ignore other parts of user's command.
-2. Select useful dimension fields from dataSource. Use the original dimension field without any process.
-3. Aggregate the measure fields. Supported aggregation function: MAX(), MIN(), SUM(), COUNT(), AVG(). Note: don't aggregate measures using functions that are not supported such as PERCENTILE_CONT(). Don't use conditional statement.
-4. Group the data using dimension fields and fill it in GROUP BY.
-5. You can also use WHERE, HAVING, ORDER BY, LIMIT in your SQL if necessary.
-
-
-# Description
-1. The part in brackets is optional. xxx in the SQL template can only be original columns or aggregated columns. Select Data only from one table. Don't use unsupported features such as RANK(), TOP, UNION, etc.
-2. Make your SQL as simple as possible. Strictly follow the SQL template to generate SQL. Don't use JOIN, UNION, subquery or other feature that is not in the SQL template. Don't process fields in ways other than supported aggregation functions.
-3. Please don't change or translate the field names in your SQL statement.
-4. Don't ignore GROUP BY in your SQL.
+1. Extract the part related to the data from the user's instruction. Ignore other parts that are not related to the data.
+2. Select useful dimension and measure columns from dataSource. You can only use columns in Column Information and do not assume non-existent columns. If the existing columns can't meet user's command, just select the most related columns in Column Information.
+3. Use the original dimension columns without any processing. Aggregate the measure columns using aggregation methods supported in SimQuery. Don't use unsupported methods. If current keywords and methods can't meet your needs, just simply select the column without any processing.
+4. Group the data using dimension columns.
+5. You can also use WHERE, HAVING, ORDER BY, LIMIT in your SimQuery if necessary. Use the supported operators to finish the WHERE and HAVING of SimQuery. You can only use binary expression such as columnA = value1, sum_b > 0. You can only use dimension values appearing in the domain of dimension columns in your expression.
 
+Let's think step by step.
 
-Response in JSON format without any additional words. Your JSON object must contain sql and fieldInfo.
+Respond with one JSON object without any additional words. Your JSON object must contain SimQuery and fieldInfo.
 
-Make your SQL as simple as possible.
-
-Response in the following JSON format:
+Response in the following format:
 \`\`\`
 {
-sql: string; //your sql statement. Note that it's a string in a JSON object so it must be in one line without any \\n.
-fieldInfo: {
-  fieldName: string; //name of the field.
-  description?: string; //description of the field. If it is an aggregated field, please describe how it is generated in detail.
-}[]; //array of the information about the fields in your sql. Describing its aggregation method and other information of the fields.
+  ${showThoughts ? 'THOUGHTS: string //your thoughts' : ''}
+  SimQuery: string; //your SimQuery query. Note that it's a string in a JSON object so it must be in one line without any \\n.
+  fieldInfo: {
+    fieldName: string; //name of the field.
+    description?: string; //description of the field. If it is an aggregated field, please describe how it is generated in detail.
+  }[]; //array of the information about the fields in your SimQuery. Describing its aggregation method and other information of the fields.
 }
 \`\`\`
 
 #Examples:
 
-User's Command: 帮我展示个人在不同方面的绩效，他是否是六边形战士
-Column Information: [{"fieldName":"key","type":"string","role":"dimension"},{"fieldName":"value","type":"int","role":"measure"}]
-
-Response:
-\`\`\`
-
-{
-  "sql": "SELECT key, SUM(value) AS performance FROM dataSource GROUP BY key",
-  "fieldInfo": [
-    {
-      "fieldName": "key",
-      "description": "The identifier of the person."
-    },
-    {
-      "fieldName": "performance",
-      "description": "An aggregated field representing the performance of the person in different aspects. It is generated by aggregating the 'value' field."
-    }
-  ]
-}
-\`\`\`
-----------------------------------
-
 User's Command: Show me the change of the GDP rankings of each country.
 Column Information: [{"fieldName":"country","type":"string","role":"dimension"},{"fieldName":"continent","type":"string","role":"dimension"},{"fieldName":"GDP","type":"float","role":"measure"},{"fieldName":"year","type":"int","role":"measure"}]
 
 Response:
 \`\`\`
 {
-  "sql": "SELECT country, year, SUM(GDP) AS total_GDP FROM dataSource GROUP BY country, year ORDER BY year, total_GDP DESC",
+  ${showThoughts ? '"THOUGHTS": string //your thoughts' : ''}
+  "SimQuery": "SELECT country, year, SUM(GDP) AS total_GDP FROM dataSource GROUP BY country, year ORDER BY year, total_GDP DESC",
   "fieldInfo": [
     {
       "fieldName": "country",
@@ -291,7 +264,8 @@ Column Information: [{"fieldName":"城市","type":"string","role":"dimension"},{
 Response:
 \`\`\`
 {
-  "sql": "SELECT 城市, SUM(\`2022年GDP（亿元）\`) as sum_2022_GDP FROM dataSource ORDER BY sum_2022_GDP DESC LIMIT 5",
+  ${showThoughts ? '"THOUGHTS": string //your thoughts' : ''}
+  "SimQuery": "SELECT 城市, SUM(\`2022年GDP（亿元）\`) as sum_2022_GDP FROM dataSource GROUP BY 城市 ORDER BY sum_2022_GDP DESC LIMIT 5",
   "fieldInfo": [
     {
       "fieldName": "城市",
@@ -312,7 +286,8 @@ Column Information: [{"fieldName":"时间","type":"string","role":"dimension"},{
 Response:
 \`\`\`
 {
-  "sql": "SELECT \`时间\`, SUM(\`男_DASH_早餐\`) AS breakfast_amount_man, SUM(\`女_DASH_早餐\`) AS breakfast_amount_woman FROM dataSource GROUP BY \`时间\`",
+  ${showThoughts ? '"THOUGHTS": string //your thoughts' : ''}
+  "SimQuery": "SELECT \`时间\`, SUM(\`男_DASH_早餐\`) AS breakfast_amount_man, SUM(\`女_DASH_早餐\`) AS breakfast_amount_woman FROM dataSource GROUP BY \`时间\`",
   "fieldInfo": [
     {
       "fieldName": "gender",
@@ -332,6 +307,7 @@ You only need to return the JSON in your response directly to the user.
 Finish your tasks in one-step.
 
 # Constraints:
-1. Write your SQL statement in one line without any \\n.
-2. Response the JSON object directly without any other contents. Make sure it can be directly parsed by JSON.parse() in JavaScript.
+1. Write your SimQuery statement in one line without any \\n.
+2. Please don't change or translate the field names in your SimQuery statement. Don't miss the GROUP BY in your query.
+3. Response the JSON object directly without any other contents. Make sure it can be directly parsed by JSON.parse() in JavaScript.
 `;