@article{elavsky_data_2023,
title = {Data {Navigator}: {An} {Accessibility}-{Centered} {Data} {Navigation} {Toolkit}},
volume = {30},
issn = {1077-2626},
shorttitle = {Data {Navigator}},
url = {https://doi.org/10.1109/TVCG.2023.3327393},
doi = {10.1109/TVCG.2023.3327393},
abstract = {Making data visualizations accessible for people with disabilities remains a significant challenge in current practitioner efforts. Existing visualizations often lack an underlying navigable structure, fail to engage necessary input modalities, and rely heavily on visual-only rendering practices. These limitations exclude people with disabilities, especially users of assistive technologies. To address these challenges, we present Data Navigator: a system built on a dynamic graph structure, enabling developers to construct navigable lists, trees, graphs, and flows as well as spatial, diagrammatic, and geographic relations. Data Navigator supports a wide range of input modalities: screen reader, keyboard, speech, gesture detection, and even fabricated assistive devices. We present 3 case examples with Data Navigator, demonstrating we can provide accessible navigation structures on top of raster images, integrate with existing toolkits at scale, and rapidly develop novel prototypes. Data Navigator is a step towards making accessible data visualizations easier to design and implement.},
number = {1},
urldate = {2024-10-25},
journal = {IEEE Transactions on Visualization and Computer Graphics},
author = {Elavsky, Frank and Nadolskis, Lucas and Moritz, Dominik},
month = oct,
year = {2023},
pages = {803--813},
}
@article{author_study_nodate,
title = {Study on {Differences} in {UI} {Resource} {Identifiers} {Across} {Different} {Cultures}},
language = {en},
author = {Author, Anonymous},
}
@misc{noauthor_summary_nodate,
title = {Summary of {Shriram} {Comments}},
url = {https://docs.google.com/document/u/0/d/1bTJSs0dU3MIjt7HTsEUnJ1rKv6WlmYtzeLpTFElM6t0/edit?usp=embed_facebook},
language = {en},
urldate = {2024-10-23},
journal = {Google Docs},
}
@inproceedings{pan_human-computer_2023,
address = {New York, NY, USA},
series = {{CHI} '23},
title = {A {Human}-{Computer} {Collaborative} {Editing} {Tool} for {Conceptual} {Diagrams}},
isbn = {978-1-4503-9421-5},
url = {https://dl.acm.org/doi/10.1145/3544548.3580676},
doi = {10.1145/3544548.3580676},
abstract = {Editing (e.g., editing conceptual diagrams) is a typical office task that requires numerous tedious GUI operations, resulting in poor interaction efficiency and user experience, especially on mobile devices. In this paper, we present a new type of human-computer collaborative editing tool (CET) that enables accurate and efficient editing with little interaction effort. CET divides the task into two parts, and the human and the computer focus on their respective specialties: the human describes high-level editing goals with multimodal commands, while the computer calculates, recommends, and performs detailed operations. We conducted a formative study (N = 16) to determine the concrete task division and implemented the tool on Android devices for the specific tasks of editing concept diagrams. The user study (N = 24 + 20) showed that it increased diagram editing speed by 32.75\% compared with existing state-of-the-art commercial tools and led to better editing results and user experience.},
urldate = {2024-10-21},
booktitle = {Proceedings of the 2023 {CHI} {Conference} on {Human} {Factors} in {Computing} {Systems}},
publisher = {Association for Computing Machinery},
author = {Pan, Lihang and Yu, Chun and He, Zhe and Shi, Yuanchun},
month = apr,
year = {2023},
pages = {1--29},
}
@inproceedings{clark_homotopy_2023,
address = {New York, NY, USA},
series = {{FARM} 2023},
title = {Homotopy {Type} {Theory} for {Sewn} {Quilts}},
isbn = {9798400702952},
url = {https://dl.acm.org/doi/10.1145/3609023.3609803},
doi = {10.1145/3609023.3609803},
abstract = {This paper introduces PieceWork, an imperative programming language for the construction of designs for sewn quilts, whose semantics are inspired by Homotopy Type Theory. The goals of PieceWork include improving the diversity of sewn designs that can be represented in computational methods, demonstrating a creative application of Homotopy Type Theory, and demonstrating that the craft of quilting is a worthy object of study in programming language theory.
We develop an operational semantics, provide a prototype implementation and examples, and provide initial theoretical results. Type system design is in-progress.},
urldate = {2024-10-21},
booktitle = {Proceedings of the 11th {ACM} {SIGPLAN} {International} {Workshop} on {Functional} {Art}, {Music}, {Modelling}, and {Design}},
publisher = {Association for Computing Machinery},
author = {Clark, Charlotte and Bohrer, Rose},
month = aug,
year = {2023},
pages = {32--43},
}
@article{li_imesh_2024,
	title = {I♥{MESH}: {A} {DSL} for {Mesh} {Processing}},
volume = {43},
issn = {0730-0301},
	shorttitle = {I♥{MESH}},
url = {https://doi.org/10.1145/3662181},
doi = {10.1145/3662181},
	abstract = {Mesh processing algorithms are often communicated via concise mathematical notation (e.g., summation over mesh neighborhoods). However, conversion of notation into working code remains a time-consuming and error-prone process, which requires arcane knowledge of low-level data structures and libraries—impeding rapid exploration of high-level algorithms. We address this problem by introducing a domain-specific language (DSL) for mesh processing called I♥MESH, which resembles notation commonly used in visual and geometric computing and automates the process of converting notation into code. The centerpiece of our language is a flexible notation for specifying and manipulating neighborhoods of a cell complex, internally represented via standard operations on sparse boundary matrices. This layered design enables natural expression of algorithms while minimizing demands on a code generation backend. In particular, by integrating I♥MESH with the linear algebra features of the I♥LA DSL and adding support for automatic differentiation, we can rapidly implement a rich variety of algorithms on point clouds, surface meshes, and volume meshes.},
number = {5},
urldate = {2024-10-21},
journal = {ACM Trans. Graph.},
author = {Li, Yong and Kamil, Shoaib and Crane, Keenan and Jacobson, Alec and Gingold, Yotam},
month = jun,
year = {2024},
pages = {154:1--154:17},
}
@article{li_i_2021,
title = {I♥{LA}: compilable markdown for linear algebra},
volume = {40},
issn = {0730-0301},
shorttitle = {I♥{LA}},
url = {https://dl.acm.org/doi/10.1145/3478513.3480506},
doi = {10.1145/3478513.3480506},
	abstract = {Communicating linear algebra in written form is challenging: mathematicians must choose between writing in languages that produce well-formatted but semantically-underdefined representations such as LaTeX; or languages with well-defined semantics but notation unlike conventional math, such as C++/Eigen. In both cases, the underlying linear algebra is obfuscated by the requirements of esoteric language syntax (as in LaTeX) or awkward APIs due to language semantics (as in C++). The gap between representations results in communication challenges, including underspecified and irreproducible research results, difficulty teaching math concepts underlying complex numerical code, as well as repeated, redundant, and error-prone translations from communicated linear algebra to executable code. We introduce I♥LA, a language with syntax designed to closely mimic conventionally-written linear algebra, while still ensuring an unambiguous, compilable interpretation. Inspired by Markdown, a language for writing naturally-structured plain text files that translate into valid HTML, I♥LA allows users to write linear algebra in text form and compile the same source into LaTeX, C++/Eigen, Python/NumPy/SciPy, and MATLAB, with easy extension to further math programming environments. We outline the principles of our language design and highlight design decisions that balance between readability and precise semantics, and demonstrate through case studies the ability for I♥LA to bridge the semantic gap between conventionally-written linear algebra and unambiguous interpretation in math programming environments.},
number = {6},
urldate = {2024-10-21},
journal = {ACM Trans. Graph.},
author = {Li, Yong and Kamil, Shoaib and Jacobson, Alec and Gingold, Yotam},
month = dec,
year = {2021},
pages = {264:1--264:14},
}
@incollection{avigad_design_2020,
address = {Cham},
title = {The {Design} of {Mathematical} {Language}},
isbn = {978-3-030-19071-2},
url = {https://doi.org/10.1007/978-3-030-19071-2_64-1},
abstract = {As idealized descriptions of mathematical language, there is a sense in which formal systems specify too little, and there is a sense in which they specify too much. On the one hand, formal languages fail to account for a number of features of informal mathematical language that are essential to the communicative and inferential goals of the subject. On the other hand, many of these features are independent of the choice of a formal foundation, so grounding their analysis on a particular choice of a formal system introduces unnecessary specificity. This chapter begins to map out the design features of mathematical language without descending to the level of formal implementation, drawing on examples from the mathematical literature and insights from the design of computational proof assistants and their libraries.},
language = {en},
urldate = {2024-10-10},
booktitle = {Handbook of the {History} and {Philosophy} of {Mathematical} {Practice}},
publisher = {Springer International Publishing},
author = {Avigad, Jeremy},
editor = {Sriraman, Bharath},
year = {2020},
doi = {10.1007/978-3-030-19071-2_64-1},
keywords = {Formalization, Mathematical language, Proof assistants},
pages = {1--39},
}
@book{stoll_set_2012,
title = {Set {Theory} and {Logic}},
isbn = {978-0-486-13964-7},
	abstract = {Set Theory and Logic is the result of a course of lectures for advanced undergraduates, developed at Oberlin College for the purpose of introducing students to the conceptual foundations of mathematics. Mathematics, specifically the real number system, is approached as a unity whose operations can be logically ordered through axioms. One of the most complex and essential of modern mathematical innovations, the theory of sets (crucial to quantum mechanics and other sciences), is introduced in a most careful conceptual manner, aiming for the maximum in clarity and stimulation for further study in set logic. Contents include: Sets and Relations — Cantor's concept of a set, etc.; Natural Number Sequence — Zorn's Lemma, etc.; Extension of Natural Numbers to Real Numbers; Logic — the Statement and Predicate Calculus, etc.; Informal Axiomatic Mathematics; Boolean Algebra; Informal Axiomatic Set Theory; Several Algebraic Theories — Rings, Integral Domains, Fields, etc.; First-Order Theories — Metamathematics, etc. Symbolic logic does not figure significantly until the final chapter. The main theme of the book is mathematics as a system seen through the elaboration of real numbers; set theory and logic are seen as efficient tools in constructing axioms necessary to the system. Mathematics students at the undergraduate level, and those who seek a rigorous but not unnecessarily technical introduction to mathematical concepts, will welcome the return to print of this most lucid work. "Professor Stoll . . . has given us one of the best introductory texts we have seen." — Cosmos. "In the reviewer's opinion, this is an excellent book, and in addition to its use as a textbook (it contains a wealth of exercises and examples) can be recommended to all who wish an introduction to mathematical logic less technical than standard treatises (to which it can also serve as preliminary reading)." — Mathematical Reviews.},
language = {en},
publisher = {Courier Corporation},
author = {Stoll, Robert R.},
month = may,
year = {2012},
note = {Google-Books-ID: Qb6IAAAAQBAJ},
keywords = {Mathematics / Logic, Mathematics / Set Theory},
}
@article{wu_iconshop_2023,
title = {{IconShop}: {Text}-{Guided} {Vector} {Icon} {Synthesis} with {Autoregressive} {Transformers}},
volume = {42},
issn = {0730-0301},
shorttitle = {{IconShop}},
url = {https://dl.acm.org/doi/10.1145/3618364},
doi = {10.1145/3618364},
	abstract = {Scalable Vector Graphics (SVG) is a popular vector image format that offers good support for interactivity and animation. Despite its appealing characteristics, creating custom SVG content can be challenging for users due to the steep learning curve required to understand SVG grammars or get familiar with professional editing software. Recent advancements in text-to-image generation have inspired researchers to explore vector graphics synthesis using either image-based methods (i.e., text → raster image → vector graphics) combining text-to-image generation models with image vectorization, or language-based methods (i.e., text → vector graphics script) through pretrained large language models. Nevertheless, these methods suffer from limitations in terms of generation quality, diversity, and flexibility. In this paper, we introduce IconShop, a text-guided vector icon synthesis method using autoregressive transformers. The key to success of our approach is to sequentialize and tokenize SVG paths (and textual descriptions as guidance) into a uniquely decodable token sequence. With that, we are able to exploit the sequence learning power of autoregressive transformers, while enabling both unconditional and text-conditioned icon synthesis. Through standard training to predict the next token on a large-scale vector icon dataset accompanied by textual descriptions, the proposed IconShop consistently exhibits better icon synthesis capability than existing image-based and language-based methods both quantitatively (using the FID and CLIP scores) and qualitatively (through formal subjective user studies). Meanwhile, we observe a dramatic improvement in generation diversity, which is validated by the objective Uniqueness and Novelty measures. More importantly, we demonstrate the flexibility of IconShop with multiple novel icon synthesis tasks, including icon editing, icon interpolation, icon semantic combination, and icon design auto-suggestion.},
number = {6},
urldate = {2024-10-02},
journal = {ACM Trans. Graph.},
author = {Wu, Ronghuan and Su, Wanchao and Ma, Kede and Liao, Jing},
month = dec,
year = {2023},
pages = {230:1--230:14},
}
@misc{liu_best_2024,
title = {Best {Practices} and {Lessons} {Learned} on {Synthetic} {Data}},
url = {https://arxiv.org/abs/2404.07503v2},
abstract = {The success of AI models relies on the availability of large, diverse, and high-quality datasets, which can be challenging to obtain due to data scarcity, privacy concerns, and high costs. Synthetic data has emerged as a promising solution by generating artificial data that mimics real-world patterns. This paper provides an overview of synthetic data research, discussing its applications, challenges, and future directions. We present empirical evidence from prior art to demonstrate its effectiveness and highlight the importance of ensuring its factuality, fidelity, and unbiasedness. We emphasize the need for responsible use of synthetic data to build more powerful, inclusive, and trustworthy language models.},
language = {en},
urldate = {2024-10-02},
	publisher = {arXiv},
author = {Liu, Ruibo and Wei, Jerry and Liu, Fangyu and Si, Chenglei and Zhang, Yanzhe and Rao, Jinmeng and Zheng, Steven and Peng, Daiyi and Yang, Diyi and Zhou, Denny and Dai, Andrew M.},
month = apr,
year = {2024},
}
@misc{huang_instahide_2020,
title = {{InstaHide}: {Instance}-hiding {Schemes} for {Private} {Distributed} {Learning}},
shorttitle = {{InstaHide}},
url = {https://arxiv.org/abs/2010.02772v2},
	abstract = {How can multiple distributed entities collaboratively train a shared deep net on their private data while preserving privacy? This paper introduces InstaHide, a simple encryption of training images, which can be plugged into existing distributed deep learning pipelines. The encryption is efficient and applying it during training has minor effect on test accuracy. InstaHide encrypts each training image with a "one-time secret key" which consists of mixing a number of randomly chosen images and applying a random pixel-wise mask. Other contributions of this paper include: (a) Using a large public dataset (e.g. ImageNet) for mixing during its encryption, which improves security. (b) Experimental results to show effectiveness in preserving privacy against known attacks with only minor effects on accuracy. (c) Theoretical analysis showing that successfully attacking privacy requires attackers to solve a difficult computational problem. (d) Demonstrating that use of the pixel-wise mask is important for security, since Mixup alone is shown to be insecure to some efficient attacks. (e) Release of a challenge dataset https://github.com/Hazelsuko07/InstaHide\_Challenge Our code is available at https://github.com/Hazelsuko07/InstaHide},
language = {en},
urldate = {2024-10-02},
	publisher = {arXiv},
author = {Huang, Yangsibo and Song, Zhao and Li, Kai and Arora, Sanjeev},
month = oct,
year = {2020},
}
@misc{wang_codeclm_2024,
title = {{CodecLM}: {Aligning} {Language} {Models} with {Tailored} {Synthetic} {Data}},
shorttitle = {{CodecLM}},
url = {https://arxiv.org/abs/2404.05875v1},
abstract = {Instruction tuning has emerged as the key in aligning large language models (LLMs) with specific task instructions, thereby mitigating the discrepancy between the next-token prediction objective and users' actual goals. To reduce the labor and time cost to collect or annotate data by humans, researchers start to explore the use of LLMs to generate instruction-aligned synthetic data. Recent works focus on generating diverse instructions and applying LLM to increase instruction complexity, often neglecting downstream use cases. It remains unclear how to tailor high-quality data to elicit better instruction-following abilities in different target instruction distributions and LLMs. To this end, we introduce CodecLM, a general framework for adaptively generating high-quality synthetic data for LLM alignment with different downstream instruction distributions and LLMs. Drawing on the Encode-Decode principles, we use LLMs as codecs to guide the data generation process. We first encode seed instructions into metadata, which are concise keywords generated on-the-fly to capture the target instruction distribution, and then decode metadata to create tailored instructions. We also introduce Self-Rubrics and Contrastive Filtering during decoding to tailor data-efficient samples. Extensive experiments on four open-domain instruction following benchmarks validate the effectiveness of CodecLM over the current state-of-the-arts.},
language = {en},
urldate = {2024-10-02},
	publisher = {arXiv},
author = {Wang, Zifeng and Li, Chun-Liang and Perot, Vincent and Le, Long T. and Miao, Jin and Zhang, Zizhao and Lee, Chen-Yu and Pfister, Tomas},
month = apr,
year = {2024},
}
@inproceedings{yuksekgonul_when_2022,
title = {When and {Why} {Vision}-{Language} {Models} {Behave} like {Bags}-{Of}-{Words}, and {What} to {Do} {About} {It}?},
url = {https://openreview.net/forum?id=KRLUvxh8uaX},
	abstract = {Despite the success of large vision and language models (VLMs) in many downstream applications, it is unclear how well they encode the compositional relationships between objects and attributes. Here, we create the Attribution, Relation, and Order (ARO) benchmark to systematically evaluate the ability of VLMs to understand different types of relationships, attributes, and order information. ARO consists of Visual Genome Attribution, to test the understanding of objects' properties; Visual Genome Relation, to test for relational understanding; and COCO-Order \& Flickr30k-Order, to test for order sensitivity in VLMs. ARO is orders of magnitude larger than previous benchmarks of compositionality, with more than 50,000 test cases. We present the settings where state-of-the-art VLMs behave like bags-of-words---i.e. when they have poor relational understanding, can blunder when linking objects to their attributes, and demonstrate a severe lack of order sensitivity. VLMs are predominantly trained and evaluated on large scale datasets with rich compositional structure in the images and captions. Yet, training on these datasets has not been enough to address the lack of compositional understanding, and evaluating on these datasets has failed to surface this deficiency. To understand why these limitations emerge and are not represented in the standard tests, we zoom into the evaluation and training procedures. We demonstrate that it is possible to perform well on image-text retrieval over existing datasets without using the composition and order information. This further motivates the value of using ARO to benchmark VLMs. Given that contrastive pretraining optimizes for retrieval on large datasets with similar shortcuts, we hypothesize that this can explain why the models do not need to learn to represent compositional information. This finding suggests a natural solution: composition-aware hard negative mining. We show that a simple-to-implement modification of contrastive learning significantly improves the performance on tasks requiring understanding of order and compositionality.},
language = {en},
urldate = {2024-10-01},
author = {Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James},
month = sep,
year = {2022},
}
@misc{wei_gita_2024,
title = {{GITA}: {Graph} to {Visual} and {Textual} {Integration} for {Vision}-{Language} {Graph} {Reasoning}},
shorttitle = {{GITA}},
url = {http://arxiv.org/abs/2402.02130},
doi = {10.48550/arXiv.2402.02130},
	abstract = {Large Language Models (LLMs) are increasingly used for various tasks with graph structures. Though LLMs can process graph information in a textual format, they overlook the rich vision modality, which is an intuitive way for humans to comprehend structural information and conduct general graph reasoning. The potential benefits and capabilities of representing graph structures as visual images (i.e., visual graphs) are still unexplored. To fill the gap, we innovatively propose an end-to-end framework, called Graph to vIsual and Textual IntegrAtion (GITA), which firstly incorporates visual graphs into general graph reasoning. Besides, we establish the Graph-based Vision-Language Question Answering (GVLQA) dataset from existing graph data, which is the first vision-language dataset for general graph reasoning purposes. Extensive experiments on the GVLQA dataset and five real-world datasets show that GITA outperforms mainstream LLMs in terms of general graph reasoning capabilities. Moreover, we highlight the effectiveness of the layout augmentation on visual graphs and pretraining on the GVLQA dataset.},
urldate = {2024-10-01},
publisher = {arXiv},
author = {Wei, Yanbin and Fu, Shuai and Jiang, Weisen and Zhang, Zejian and Zeng, Zhixiong and Wu, Qi and Kwok, James T. and Zhang, Yu},
month = may,
year = {2024},
note = {arXiv:2402.02130 [cs]},
keywords = {Computer Science - Computation and Language},
}
@misc{li_visiongraph_2024,
title = {{VisionGraph}: {Leveraging} {Large} {Multimodal} {Models} for {Graph} {Theory} {Problems} in {Visual} {Context}},
shorttitle = {{VisionGraph}},
url = {https://arxiv.org/abs/2405.04950v1},
abstract = {Large Multimodal Models (LMMs) have achieved impressive success in visual understanding and reasoning, remarkably improving the performance of mathematical reasoning in a visual context. Yet, a challenging type of visual math lies in the multimodal graph theory problem, which demands that LMMs understand the graphical structures accurately and perform multi-step reasoning on the visual graph. Additionally, exploring multimodal graph theory problems will lead to more effective strategies in fields like biology, transportation, and robotics planning. To step forward in this direction, we are the first to design a benchmark named VisionGraph, used to explore the capabilities of advanced LMMs in solving multimodal graph theory problems. It encompasses eight complex graph problem tasks, from connectivity to shortest path problems. Subsequently, we present a Description-Program-Reasoning (DPR) chain to enhance the logical accuracy of reasoning processes through graphical structure description generation and algorithm-aware multi-step reasoning. Our extensive study shows that 1) GPT-4V outperforms Gemini Pro in multi-step graph reasoning; 2) All LMMs exhibit inferior perception accuracy for graphical structures, whether in zero/few-shot settings or with supervised fine-tuning (SFT), which further affects problem-solving performance; 3) DPR significantly improves the multi-step graph reasoning capabilities of LMMs and the GPT-4V (DPR) agent achieves SOTA performance.},
language = {en},
urldate = {2024-10-01},
	publisher = {arXiv},
author = {Li, Yunxin and Hu, Baotian and Shi, Haoyuan and Wang, Wei and Wang, Longyue and Zhang, Min},
month = may,
year = {2024},
}
@misc{zou_vgbench_2024,
title = {{VGBench}: {Evaluating} {Large} {Language} {Models} on {Vector} {Graphics} {Understanding} and {Generation}},
shorttitle = {{VGBench}},
url = {http://arxiv.org/abs/2407.10972},
doi = {10.48550/arXiv.2407.10972},
abstract = {In the realm of vision models, the primary mode of representation is using pixels to rasterize the visual world. Yet this is not always the best or unique way to represent visual content, especially for designers and artists who depict the world using geometry primitives such as polygons. Vector graphics (VG), on the other hand, offer a textual representation of visual content, which can be more concise and powerful for content like cartoons, sketches and scientific figures. Recent studies have shown promising results on processing vector graphics with capable Large Language Models (LLMs). However, such works focus solely on qualitative results, understanding, or a specific type of vector graphics. We propose VGBench, a comprehensive benchmark for LLMs on handling vector graphics through diverse aspects, including (a) both visual understanding and generation, (b) evaluation of various vector graphics formats, (c) diverse question types, (d) wide range of prompting techniques, (e) under multiple LLMs and (f) comparison with VLMs on rasterized representations. Evaluating on our collected 4279 understanding and 5845 generation samples, we find that LLMs show strong capability on both aspects while exhibiting less desirable performance on low-level formats (SVG). Both data and evaluation pipeline will be open-sourced at https://vgbench.github.io.},
urldate = {2024-10-01},
publisher = {arXiv},
author = {Zou, Bocheng and Cai, Mu and Zhang, Jianrui and Lee, Yong Jae},
month = aug,
year = {2024},
note = {arXiv:2407.10972 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}
@article{huang_language_2023,
title = {Language {Is} {Not} {All} {You} {Need}: {Aligning} {Perception} with {Language} {Models}},
volume = {36},
shorttitle = {Language {Is} {Not} {All} {You} {Need}},
url = {https://proceedings.neurips.cc/paper_files/paper/2023/hash/e425b75bac5742a008d643826428787c-Abstract-Conference.html},
language = {en},
urldate = {2024-10-01},
journal = {Advances in Neural Information Processing Systems},
author = {Huang, Shaohan and Dong, Li and Wang, Wenhui and Hao, Yaru and Singhal, Saksham and Ma, Shuming and Lv, Tengchao and Cui, Lei and Mohammed, Owais Khan and Patra, Barun and Liu, Qiang and Aggarwal, Kriti and Chi, Zewen and Bjorck, Nils and Chaudhary, Vishrav and Som, Subhojit and Song, Xia and Wei, Furu},
month = dec,
year = {2023},
pages = {72096--72109},
}
@misc{chen_multi-object_2024,
title = {Multi-{Object} {Hallucination} in {Vision}-{Language} {Models}},
url = {http://arxiv.org/abs/2407.06192},
doi = {10.48550/arXiv.2407.06192},
	abstract = {Large vision language models (LVLMs) often suffer from object hallucination, producing objects not present in the given images. While current benchmarks for object hallucination primarily concentrate on the presence of a single object class rather than individual entities, this work systematically investigates multi-object hallucination, examining how models misperceive (e.g., invent nonexistent objects or become distracted) when tasked with focusing on multiple objects simultaneously. We introduce Recognition-based Object Probing Evaluation (ROPE), an automated evaluation protocol that considers the distribution of object classes within a single image during testing and uses visual referring prompts to eliminate ambiguity. With comprehensive empirical studies and analysis of potential factors leading to multi-object hallucination, we found that (1) LVLMs suffer more hallucinations when focusing on multiple objects compared to a single object. (2) The tested object class distribution affects hallucination behaviors, indicating that LVLMs may follow shortcuts and spurious correlations. (3) Hallucinatory behaviors are influenced by data-specific factors, salience and frequency, and model intrinsic behaviors. We hope to enable LVLMs to recognize and reason about multiple objects that often occur in realistic visual scenes, provide insights, and quantify our progress towards mitigating the issues.},
urldate = {2024-10-01},
publisher = {arXiv},
author = {Chen, Xuweiyi and Ma, Ziqiao and Zhang, Xuejun and Xu, Sihan and Qian, Shengyi and Yang, Jianing and Fouhey, David F. and Chai, Joyce},
month = jul,
year = {2024},
note = {arXiv:2407.06192 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
}
@article{trinh_solving_2024,
title = {Solving olympiad geometry without human demonstrations},
volume = {625},
copyright = {2024 The Author(s)},
issn = {1476-4687},
url = {https://www.nature.com/articles/s41586-023-06747-5},
doi = {10.1038/s41586-023-06747-5},
	abstract = {Proving mathematical theorems at the olympiad level represents a notable milestone in human-level automated reasoning, owing to their reputed difficulty among the world’s best talents in pre-university mathematics. Current machine-learning approaches, however, are not applicable to most mathematical domains owing to the high cost of translating human proofs into machine-verifiable format. The problem is even worse for geometry because of its unique translation challenges, resulting in severe scarcity of training data. We propose AlphaGeometry, a theorem prover for Euclidean plane geometry that sidesteps the need for human demonstrations by synthesizing millions of theorems and proofs across different levels of complexity. AlphaGeometry is a neuro-symbolic system that uses a neural language model, trained from scratch on our large-scale synthetic data, to guide a symbolic deduction engine through infinite branching points in challenging problems. On a test set of 30 latest olympiad-level problems, AlphaGeometry solves 25, outperforming the previous best method that only solves ten problems and approaching the performance of an average International Mathematical Olympiad (IMO) gold medallist. Notably, AlphaGeometry produces human-readable proofs, solves all geometry problems in the IMO 2000 and 2015 under human expert evaluation and discovers a generalized version of a translated IMO theorem in 2004.},
language = {en},
number = {7995},
urldate = {2024-10-01},
journal = {Nature},
author = {Trinh, Trieu H. and Wu, Yuhuai and Le, Quoc V. and He, He and Luong, Thang},
month = jan,
year = {2024},
note = {Publisher: Nature Publishing Group},
keywords = {Computational science, Computer science},
pages = {476--482},
}
@misc{lu_mathvista_2024,
title = {{MathVista}: {Evaluating} {Mathematical} {Reasoning} of {Foundation} {Models} in {Visual} {Contexts}},
shorttitle = {{MathVista}},
url = {http://arxiv.org/abs/2310.02255},
doi = {10.48550/arXiv.2310.02255},
abstract = {Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit impressive problem-solving skills in many tasks and domains, but their ability in mathematical reasoning in visual contexts has not been systematically studied. To bridge this gap, we present MathVista, a benchmark designed to combine challenges from diverse mathematical and visual tasks. It consists of 6,141 examples, derived from 28 existing multimodal datasets involving mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and PaperQA). Completing these tasks requires fine-grained, deep visual understanding and compositional reasoning, which all state-of-the-art foundation models find challenging. With MathVista, we have conducted a comprehensive, quantitative evaluation of 12 prominent foundation models. The best-performing GPT-4V model achieves an overall accuracy of 49.9\%, substantially outperforming Bard, the second-best performer, by 15.1\%. Our in-depth analysis reveals that the superiority of GPT-4V is mainly attributed to its enhanced visual perception and mathematical reasoning. However, GPT-4V still falls short of human performance by 10.4\%, as it often struggles to understand complex figures and perform rigorous reasoning. This significant gap underscores the critical role that MathVista will play in the development of general-purpose AI agents capable of tackling mathematically intensive and visually rich real-world tasks. We further explore the new ability of self-verification, the application of self-consistency, and the interactive chatbot capabilities of GPT-4V, highlighting its promising potential for future research. The project is available at https://mathvista.github.io/.},
urldate = {2024-09-01},
publisher = {arXiv},
author = {Lu, Pan and Bansal, Hritik and Xia, Tony and Liu, Jiacheng and Li, Chunyuan and Hajishirzi, Hannaneh and Cheng, Hao and Chang, Kai-Wei and Galley, Michel and Gao, Jianfeng},
month = jan,
year = {2024},
note = {arXiv:2310.02255 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
}
@misc{hsu_scicap_2021,
title = {{SciCap}: {Generating} {Captions} for {Scientific} {Figures}},
shorttitle = {{SciCap}},
url = {https://arxiv.org/abs/2110.11624v2},
abstract = {Researchers use figures to communicate rich, complex information in scientific papers. The captions of these figures are critical to conveying effective messages. However, low-quality figure captions commonly occur in scientific articles and may decrease understanding. In this paper, we propose an end-to-end neural framework to automatically generate informative, high-quality captions for scientific figures. To this end, we introduce SCICAP, a large-scale figure-caption dataset based on computer science arXiv papers published between 2010 and 2020. After pre-processing - including figure-type classification, sub-figure identification, text normalization, and caption text selection - SCICAP contained more than two million figures extracted from over 290,000 papers. We then established baseline models that caption graph plots, the dominant (19.2\%) figure type. The experimental results showed both opportunities and steep challenges of generating captions for scientific figures.},
language = {en},
urldate = {2024-10-01},
	publisher = {arXiv},
author = {Hsu, Ting-Yao and Giles, C. Lee and Huang, Ting-Hao 'Kenneth'},
month = oct,
year = {2021},
}
@inproceedings{jain_vectorfusion_2023,
title = {{VectorFusion}: {Text}-to-{SVG} by {Abstracting} {Pixel}-{Based} {Diffusion} {Models}},
shorttitle = {{VectorFusion}},
url = {https://ieeexplore.ieee.org/document/10204913},
doi = {10.1109/CVPR52729.2023.00190},
abstract = {Diffusion models have shown impressive results in text-to-image synthesis. Using massive datasets of captioned images, diffusion models learn to generate raster images of highly diverse objects and scenes. However, designers frequently use vector representations of images like Scalable Vector Graphics (SVGs) for digital icons or art. Vector graphics can be scaled to any size, and are compact. We show that a text-conditioned diffusion model trained on pixel representations of images can be used to generate SVG-exportable vector graphics. We do so without access to large datasets of captioned SVGs. By optimizing a differentiable vector graphics rasterizer, our method, VectorFusion, distills abstract semantic knowledge out of a pretrained diffusion model. Inspired by recent text-to-3D work, we learn an SVG consistent with a caption using Score Distillation Sampling. To accelerate generation and improve fidelity, VectorFusion also initializes from an image sample. Experiments show greater quality than prior work, and demonstrate a range of styles including pixel art and sketches.},
urldate = {2024-10-01},
booktitle = {2023 {IEEE}/{CVF} {Conference} on {Computer} {Vision} and {Pattern} {Recognition} ({CVPR})},
author = {Jain, Ajay and Xie, Amber and Abbeel, Pieter},
month = jun,
year = {2023},
note = {ISSN: 2575-7075},
keywords = {Art, Computational modeling, Computer vision, Graphics, Image and video synthesis and generation, Pattern recognition, Semantics},
pages = {1911--1920},
}
@misc{belouadi_automatikz_2023,
title = {{AutomaTikZ}: {Text}-{Guided} {Synthesis} of {Scientific} {Vector} {Graphics} with {TikZ}},
shorttitle = {{AutomaTikZ}},
url = {https://arxiv.org/abs/2310.00367v2},
abstract = {Generating bitmap graphics from text has gained considerable attention, yet for scientific figures, vector graphics are often preferred. Given that vector graphics are typically encoded using low-level graphics primitives, generating them directly is difficult. To address this, we propose the use of TikZ, a well-known abstract graphics language that can be compiled to vector graphics, as an intermediate representation of scientific figures. TikZ offers human-oriented, high-level commands, thereby facilitating conditional language modeling with any large language model. To this end, we introduce DaTikZ, the first large-scale TikZ dataset consisting of 120k TikZ drawings aligned with captions. We fine-tune LLaMA on DaTikZ, as well as our new model CLiMA, which augments LLaMA with multimodal CLIP embeddings. In both human and automatic evaluation, CLiMA and LLaMA outperform commercial GPT-4 and Claude 2 in terms of similarity to human-created figures, with CLiMA additionally improving text-image alignment. Our detailed analysis shows that all models generalize well and are not susceptible to memorization. GPT-4 and Claude 2, however, tend to generate more simplistic figures compared to both humans and our models. We make our framework, AutomaTikZ, along with model weights and datasets, publicly available.},
language = {en},
urldate = {2024-10-01},
	publisher = {arXiv},
author = {Belouadi, Jonas and Lauscher, Anne and Eger, Steffen},
month = sep,
year = {2023},
}
@article{ritchie_neurosymbolic_2023,
title = {Neurosymbolic {Models} for {Computer} {Graphics}},
volume = {42},
copyright = {© 2023 Eurographics - The European Association for Computer Graphics and John Wiley \& Sons Ltd.},
issn = {1467-8659},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.14775},
doi = {10.1111/cgf.14775},
abstract = {Procedural models (i.e. symbolic programs that output visual data) are a historically-popular method for representing graphics content: vegetation, buildings, textures, etc. They offer many advantages: interpretable design parameters, stochastic variations, high-quality outputs, compact representation, and more. But they also have some limitations, such as the difficulty of authoring a procedural model from scratch. More recently, AI-based methods, and especially neural networks, have become popular for creating graphic content. These techniques allow users to directly specify desired properties of the artifact they want to create (via examples, constraints, or objectives), while a search, optimization, or learning algorithm takes care of the details. However, this ease of use comes at a cost, as it's often hard to interpret or manipulate these representations. In this state-of-the-art report, we summarize research on neurosymbolic models in computer graphics: methods that combine the strengths of both AI and symbolic programs to represent, generate, and manipulate visual data. We survey recent work applying these techniques to represent 2D shapes, 3D shapes, and materials \& textures. Along the way, we situate each prior work in a unified design space for neurosymbolic models, which helps reveal underexplored areas and opportunities for future research.},
language = {en},
number = {2},
urldate = {2024-09-22},
journal = {Computer Graphics Forum},
author = {Ritchie, Daniel and Guerrero, Paul and Jones, R. Kenny and Mitra, Niloy J. and Schulz, Adriana and Willis, Karl D. D. and Wu, Jiajun},
year = {2023},
note = {\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.14775},
	keywords = {Computer vision, Neural networks, Programming by example, Reflectance modeling, Texturing, Computing methodologies → Shape modeling, Software and its engineering → Domain specific languages},
pages = {545--568},
}
@article{jones_shapecoder_2023,
title = {{ShapeCoder}: {Discovering} {Abstractions} for {Visual} {Programs} from {Unstructured} {Primitives}},
volume = {42},
issn = {0730-0301},
shorttitle = {{ShapeCoder}},
url = {https://dl.acm.org/doi/10.1145/3592416},
doi = {10.1145/3592416},
abstract = {We introduce ShapeCoder, the first system capable of taking a dataset of shapes, represented with unstructured primitives, and jointly discovering (i) useful abstraction functions and (ii) programs that use these abstractions to explain the input shapes. The discovered abstractions capture common patterns (both structural and parametric) across a dataset, so that programs rewritten with these abstractions are more compact, and suppress spurious degrees of freedom. ShapeCoder improves upon previous abstraction discovery methods, finding better abstractions, for more complex inputs, under less stringent input assumptions. This is principally made possible by two methodological advancements: (a) a shape-to-program recognition network that learns to solve sub-problems and (b) the use of e-graphs, augmented with a conditional rewrite scheme, to determine when abstractions with complex parametric expressions can be applied, in a tractable manner. We evaluate ShapeCoder on multiple datasets of 3D shapes, where primitive decompositions are either parsed from manual annotations or produced by an unsupervised cuboid abstraction method. In all domains, ShapeCoder discovers a library of abstractions that captures high-level relationships, removes extraneous degrees of freedom, and achieves better dataset compression compared with alternative approaches. Finally, we investigate how programs rewritten to use discovered abstractions prove useful for downstream tasks.},
number = {4},
urldate = {2024-09-22},
journal = {ACM Trans. Graph.},
author = {Jones, R. Kenny and Guerrero, Paul and Mitra, Niloy J. and Ritchie, Daniel},
month = jul,
year = {2023},
pages = {49:1--49:17},
}
@misc{belouadi_detikzify_2024,
title = {{DeTikZify}: {Synthesizing} {Graphics} {Programs} for {Scientific} {Figures} and {Sketches} with {TikZ}},
shorttitle = {{DeTikZify}},
url = {http://arxiv.org/abs/2405.15306},
doi = {10.48550/arXiv.2405.15306},
abstract = {Creating high-quality scientific figures can be time-consuming and challenging, even though sketching ideas on paper is relatively easy. Furthermore, recreating existing figures that are not stored in formats preserving semantic information is equally complex. To tackle this problem, we introduce DeTikZify, a novel multimodal language model that automatically synthesizes scientific figures as semantics-preserving TikZ graphics programs based on sketches and existing figures. To achieve this, we create three new datasets: DaTikZv2, the largest TikZ dataset to date, containing over 360k human-created TikZ graphics; SketchFig, a dataset that pairs hand-drawn sketches with their corresponding scientific figures; and SciCap++, a collection of diverse scientific figures and associated metadata. We train DeTikZify on SciCap++ and DaTikZv2, along with synthetically generated sketches learned from SketchFig. We also introduce an MCTS-based inference algorithm that enables DeTikZify to iteratively refine its outputs without the need for additional training. Through both automatic and human evaluation, we demonstrate that DeTikZify outperforms commercial Claude 3 and GPT-4V in synthesizing TikZ programs, with the MCTS algorithm effectively boosting its performance. We make our code, models, and datasets publicly available.},
urldate = {2024-09-22},
publisher = {arXiv},
author = {Belouadi, Jonas and Ponzetto, Simone Paolo and Eger, Steffen},
month = may,
year = {2024},
note = {arXiv:2405.15306 [cs]},
keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition},
}
@inproceedings{zhao_vector_2024,
title = {Vector {Graphics} {Generation} via {Mutually} {Impulsed} {Dual}-domain {Diffusion}},
url = {https://openaccess.thecvf.com/content/CVPR2024/html/Zhao_Vector_Graphics_Generation_via_Mutually_Impulsed_Dual-domain_Diffusion_CVPR_2024_paper.html},
language = {en},
urldate = {2024-09-22},
	booktitle = {Proceedings of the {IEEE}/{CVF} {Conference} on {Computer} {Vision} and {Pattern} {Recognition} ({CVPR})},
author = {Zhao, Zhongyin and Chen, Ye and Hu, Zhangli and Chen, Xuanhong and Ni, Bingbing},
year = {2024},
pages = {4420--4428},
}
@article{Minarcik2024Minkowski,
title = {Minkowski penalties: {Robust} differentiable constraint enforcement for vector graphics},
volume = {43},
doi = {10.1145/3641519.3657495},
number = {4},
journal = {ACM SIGGRAPH 2024 Conference Proceedings},
author = {Minarčík, Jiří and Estep, Sam and Ni, Wode and Crane, Keenan},
year = {2024},
	note = {Place: New York, NY, USA. Publisher: ACM},
}
@misc{putta_agent_2024,
title = {Agent {Q}: {Advanced} {Reasoning} and {Learning} for {Autonomous} {AI} {Agents}},
shorttitle = {Agent {Q}},
url = {http://arxiv.org/abs/2408.07199},
doi = {10.48550/arXiv.2408.07199},
	abstract = {Large Language Models (LLMs) have shown remarkable capabilities in natural language tasks requiring complex reasoning, yet their application in agentic, multi-step reasoning within interactive environments remains a difficult challenge. Traditional supervised pre-training on static datasets falls short in enabling autonomous agent capabilities needed to perform complex decision-making in dynamic settings like web navigation. Previous attempts to bridge this gap--through supervised fine-tuning on curated expert demonstrations--often suffer from compounding errors and limited exploration data, resulting in sub-optimal policy outcomes. To overcome these challenges, we propose a framework that combines guided Monte Carlo Tree Search (MCTS) search with a self-critique mechanism and iterative fine-tuning on agent interactions using an off-policy variant of the Direct Preference Optimization (DPO) algorithm. Our method allows LLM agents to learn effectively from both successful and unsuccessful trajectories, thereby improving their generalization in complex, multi-step reasoning tasks. We validate our approach in the WebShop environment--a simulated e-commerce platform where it consistently outperforms behavior cloning and reinforced fine-tuning baseline, and beats average human performance when equipped with the capability to do online search. In real-world booking scenarios, our methodology boosts Llama-3 70B model's zero-shot performance from 18.6\% to 81.7\% success rate (a 340\% relative increase) after a single day of data collection and further to 95.4\% with online search. We believe this represents a substantial leap forward in the capabilities of autonomous agents, paving the way for more sophisticated and reliable decision-making in real-world settings.},
urldate = {2024-09-15},
publisher = {arXiv},
author = {Putta, Pranav and Mills, Edmund and Garg, Naman and Motwani, Sumeet and Finn, Chelsea and Garg, Divyansh and Rafailov, Rafael},
month = aug,
year = {2024},
note = {arXiv:2408.07199 [cs]},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning},
}
@misc{zhang_multimodal_2024,
title = {Multimodal {Self}-{Instruct}: {Synthetic} {Abstract} {Image} and {Visual} {Reasoning} {Instruction} {Using} {Language} {Model}},
shorttitle = {Multimodal {Self}-{Instruct}},
url = {http://arxiv.org/abs/2407.07053},
doi = {10.48550/arXiv.2407.07053},
abstract = {Although most current large multimodal models (LMMs) can already understand photos of natural scenes and portraits, their understanding of abstract images, e.g., charts, maps, or layouts, and visual reasoning capabilities remains quite rudimentary. They often struggle with simple daily tasks, such as reading time from a clock, understanding a flowchart, or planning a route using a road map. In light of this, we design a multi-modal self-instruct, utilizing large language models and their code capabilities to synthesize massive abstract images and visual reasoning instructions across daily scenarios. Our strategy effortlessly creates a multimodal benchmark with 11,193 instructions for eight visual scenarios: charts, tables, simulated maps, dashboards, flowcharts, relation graphs, floor plans, and visual puzzles. This benchmark, constructed with simple lines and geometric elements, exposes the shortcomings of most advanced LMMs like Claude-3.5-Sonnet and GPT-4o in abstract image understanding, spatial relations reasoning, and visual element induction. Besides, to verify the quality of our synthetic data, we fine-tune an LMM using 62,476 synthetic chart, table and road map instructions. The results demonstrate improved chart understanding and map navigation performance, and also demonstrate potential benefits for other visual reasoning tasks. Our code is available at: https://github.com/zwq2018/Multi-modal-Self-instruct.},
urldate = {2024-09-15},
publisher = {arXiv},
author = {Zhang, Wenqi and Cheng, Zhenglin and He, Yuanyu and Wang, Mengna and Shen, Yongliang and Tan, Zeqi and Hou, Guiyang and He, Mingqian and Ma, Yanna and Lu, Weiming and Zhuang, Yueting},
month = aug,
year = {2024},
note = {arXiv:2407.07053 [cs]},
keywords = {Computer Science - Computer Vision and Pattern Recognition},
}
@article{senk_how_1985,
title = {How {Well} {Do} {Students} {Write} {Geometry} {Proofs}?},
url = {https://pubs.nctm.org/view/journals/mt/78/6/article-p448.xml},
doi = {10.5951/MT.78.6.0448},
abstract = {Throughout the history of American education, learning to write proofs has been an important objective of the geometry curriculum for college-bound students. At the same time proof writing has also been perceived as one of the most difficult topics for students to learn. Until recently, the extent of students’ difficulties with writing proofs has been largely a matter of conjecture, for little research has been conducted in this area.},
language = {en},
urldate = {2024-09-12},
author = {Senk, Sharon L.},
month = sep,
year = {1985},
journal = {Mathematics Teacher},
volume = {78},
number = {6},
}
@article{martin_interplay_2005,
title = {The {Interplay} of {Teacher} and {Student} {Actions} in the {Teaching} and {Learning} of {Geometric} {Proof}},
volume = {60},
issn = {1573-0816},
url = {https://doi.org/10.1007/s10649-005-6698-0},
doi = {10.1007/s10649-005-6698-0},
abstract = {Proof and reasoning are fundamental aspects of mathematics. Yet, how to help students develop the skills they need to engage in this type of higher-order thinking remains elusive. In order to contribute to the dialogue on this subject, we share results from a classroom-based interpretive study of teaching and learning proof in geometry. The goal of this research was to identify factors that may be related to the development of proof understanding. In this paper, we identify and interpret students' actions, teacher's actions, and social aspects that are evident in a classroom in which students discuss mathematical conjectures, justification processes and student-generated proofs. We conclude that pedagogical choices made by the teacher, as manifested in the teacher's actions, are key to the type of classroom environment that is established and, hence, to students' opportunities to hone their proof and reasoning skills. More specifically, the teacher's choice to pose open-ended tasks (tasks which are not limited to one specific solution or solution strategy), engage in dialogue that places responsibility for reasoning on the students, analyze student arguments, and coach students as they reason, creates an environment in which participating students make conjectures, provide justifications, and build chains of reasoning. In this environment, students who actively participate in the classroom discourse are supported as they engage in proof development activities. By examining connections between teacher and student actions within a social context, we offer a first step in linking teachers' practice to students' understanding of proof.},
language = {en},
number = {1},
urldate = {2024-09-12},
journal = {Educational Studies in Mathematics},
author = {Martin, Tami S. and McCrone, Sharon M. Soucy and Bower, Michelle L. Wallace and Dindyal, Jaguthsing},
month = sep,
year = {2005},
keywords = {classroom interaction, geometry, link between teaching and learning, pedagogical choices, proof and reasoning, secondary school mathematics},
pages = {95--124},
}
@techreport{usiskin_van_1982,
title = {Van {Hiele} {Levels} and {Achievement} in {Secondary} {School} {Geometry}. {CDASSG} {Project}.},
url = {https://eric.ed.gov/?id=ed220288},
abstract = {The Van Hiele level theory offers an explanation and a remedy for student difficulty with higher-order cognitive processes required for success in secondary school geometry. This document reports results of a study which involved about 2700 students in 13 high schools, selected to provide broad representation of community socio-economics in the United States. The investigation looked at: (1) How are entering geometry students distributed with respect to the levels in the Van Hiele scheme; (2) What changes in Van Hiele levels take place after a year's study of geometry; (3) To what extent are levels related to concurrent geometry achievement; (4) To what extent do levels predict geometry achievement after a year's study; (5) What generalizations can be made concerning the entering Van Hiele level and geometry knowledge of students who are later found to be unsuccessful in their study of geometry; (6) To what extent is geometry being taught to students appropriate to their level; and},
language = {en},
urldate = {2024-09-12},
author = {Usiskin, Zalman},
institution = {University of Chicago},
year = {1982},
}
@phdthesis{koedinger_tutoring_1991,
address = {Pittsburgh, PA, US},
title = {Tutoring concepts, percepts, and rules in {Geometry} problem-solving},
url = {https://www.semanticscholar.org/paper/Tutoring-concepts%2C-percepts%2C-and-rules-in-Geometry-Koedinger/7673fe69bc57fd0821fd799ce73228c4702c3dd6},
abstract = {This thesis is a mixture of basic and applied Cognitive Science research focussed on understanding the underlying nature of a complex skill and translating this understanding into improved instruction. The basic research provides a cognitive model of the intuitive and informal proof planning abilities of skilled geometers. This model is implemented as a computer program, an "expert system" that can efficiently plan proof solutions. Previous models characterize geometry problem solving as heuristic search through a problem space of formal geometry rules--a method that originates from the step-by-step approach implied by two-column proof examples in traditional textbooks. In contrast, skilled geometers do not plan proofs in this step-by-step, rule-based fashion. Employing the methodology of verbal protocol analysis, I show that skilled subjects often skip formal steps while planning. Initially, they only consider the key ideas and leave the details for last. The knowledge that affords this efficient and informal planning is not organized around geometry rules as in previous models, but instead around categories of perceptual objects (percepts). Attached to these percepts are clusters of conceptual knowledge that indicate properties and sufficiency conditions relevant to making proof inferences. The cognitive model incorporating this knowledge is both more efficient than previous models and a better match to the human data. This model ties together a number of empirical results on the nature of human expertise and supports an inductive explanation of skill acquisition that is in contrast with the deductive approach typical of some dominant theories of skill acquisition.
The goal of the applied research is to see if this improved understanding of a complex skill can be translated into improved instruction. I built an Intelligent Tutoring System (ITS), called ANGLE, to test this idea. ANGLE was built following specific guidelines (described within) for translating the underlying cognitive model into design specifications for the computer interface and tutoring component. For example, ANGLE's interface reifies the perceptual categories in the cognitive model and, thus, provides students with a novel notation with which to think about geometry. I present a preliminary evaluation of ANGLE comparing it with a previous ITS for geometry.},
urldate = {2024-09-10},
school = {Carnegie Mellon University},
author = {Koedinger, Kenneth R.},
month = may,
year = {1991},
}
@article{disessa_metarepresentation_2004,
title = {Metarepresentation: {Native} {Competence} and {Targets} for {Instruction}},
volume = {22},
issn = {0737-0008},
shorttitle = {Metarepresentation},
url = {https://doi.org/10.1207/s1532690xci2203_2},
doi = {10.1207/s1532690xci2203_2},
abstract = {The premise of this article is that the study of representation is valuable and important for mathematics and science students. Learning about representation should go beyond learning specific, sanctioned representations emphasized in standard curricula (graphs, tables, etc.) to include principles and design strategies that apply to any scientific representation, including novel variations and even completely new representations. The article explores what it means to understand representation, what we believe students already know about the topic, and what they can profitably learn about it. The discussion includes learning difficulties—goals for instruction that appear challenging for students and may need particular attention.},
number = {3},
urldate = {2019-11-05},
journal = {Cognition and Instruction},
author = {diSessa, Andrea A.},
month = sep,
year = {2004},
pages = {293--331},
}
@article{ainsworth_drawing_2011,
title = {Drawing to learn in science},
volume = {333},
issn = {1095-9203(Electronic),0036-8075(Print)},
doi = {10.1126/science.1204153},
abstract = {Making visualizations is integral to scientific thinking. Scientists do not use words only but rely on diagrams, graphs, videos, photographs, and other images to make discoveries, explain findings, and excite public interest. However, in the science classroom, learners mainly focus on interpreting others' visualizations; when drawing does occur, it is rare that learners are systematically encouraged to create their own visual forms to develop and show understanding. Drawing includes constructing a line graph from a table of values, sketching cells observed through a microscope, or inventing a way to show a scientific phenomenon (e.g., evaporation). In this article, we suggest five reasons why student drawing should be explicitly recognized alongside writing, reading, and talking as a key element in science education. We offer distinct rationales, although in practice any single drawing activity will likely rest upon multiple justifications. Both old and new technologies offer exciting opportunities. We conclude by highlighting important questions yet to be answered and key future research to extend teachers' and learners' use of drawing. (PsycINFO Database Record (c) 2018 APA, all rights reserved)},
number = {6046},
journal = {Science},
author = {Ainsworth, Shaaron and Prain, Vaughan and Tytler, Russell},
year = {2011},
keywords = {Classrooms, Drawing, Imagery, Learning, Science Education, Teaching},
pages = {1096--1097},
}
@inproceedings{head_augmenting_2021,
address = {New York, NY, USA},
series = {{CHI} '21},
title = {Augmenting {Scientific} {Papers} with {Just}-in-{Time}, {Position}-{Sensitive} {Definitions} of {Terms} and {Symbols}},
isbn = {978-1-4503-8096-6},
url = {https://dl.acm.org/doi/10.1145/3411764.3445648},
doi = {10.1145/3411764.3445648},
abstract = {Despite the central importance of research papers to scientific progress, they can be difficult to read. Comprehension is often stymied when the information needed to understand a passage resides somewhere else—in another section, or in another paper. In this work, we envision how interfaces can bring definitions of technical terms and symbols to readers when and where they need them most. We introduce ScholarPhi, an augmented reading interface with four novel features: (1) tooltips that surface position-sensitive definitions from elsewhere in a paper, (2) a filter over the paper that “declutters” it to reveal how the term or symbol is used across the paper, (3) automatic equation diagrams that expose multiple definitions in parallel, and (4) an automatically generated glossary of important terms and symbols. A usability study showed that the tool helps researchers of all experience levels read papers. Furthermore, researchers were eager to have ScholarPhi’s definitions available to support their everyday reading.},
urldate = {2024-09-10},
booktitle = {Proceedings of the 2021 {CHI} {Conference} on {Human} {Factors} in {Computing} {Systems}},
publisher = {Association for Computing Machinery},
author = {Head, Andrew and Lo, Kyle and Kang, Dongyeop and Fok, Raymond and Skjonsberg, Sam and Weld, Daniel S. and Hearst, Marti A.},
month = may,
year = {2021},
keywords = {Definitions, Interactive documents, Nonce words, Reading interfaces, Scientific papers},
pages = {1--18},
}
@book{national_council_of_teachers_of_mathematics_principles_2000,
address = {Reston, VA},
title = {Principles and standards for school mathematics},
isbn = {978-0-87353-480-2},
abstract = {CD-ROM contains: Full text document -- E-examples -- Search engine -- Enhanced navigation -- NCTM's previous Standards documents -- Illuminations and links to other resources.},
language = {eng},
publisher = {National Council of Teachers of Mathematics},
author = {{National Council of Teachers of Mathematics}},
year = {2000},
keywords = {Mathematics, Standards, Study and teaching, United States},
}
@article{stylianides_preservice_2007,
title = {Preservice teachers’ knowledge of proof by mathematical induction},
volume = {10},
issn = {1573-1820},
url = {https://doi.org/10.1007/s10857-007-9034-z},
doi = {10.1007/s10857-007-9034-z},
abstract = {There is a growing effort to make proof central to all students’ mathematical experiences across all grades. Success in this goal depends highly on teachers’ knowledge of proof, but limited research has examined this knowledge. This paper contributes to this domain of research by investigating preservice elementary and secondary school mathematics teachers’ knowledge of proof by mathematical induction. This research can inform the knowledge about preservice teachers that mathematics teacher educators need in order to effectively teach proof to preservice teachers. Our analysis is based on written responses of 95 participants to specially developed tasks and on semi-structured interviews with 11 of them. The findings show that preservice teachers from both groups have difficulties that center around: (1) the essence of the base step of the induction method; (2) the meaning associated with the inductive step in proving the implication P(k) ⇒ P(k + 1) for an arbitrary k in the domain of discourse of P(n); and (3) the possibility of the truth set of a sentence in a statement proved by mathematical induction to include values outside its domain of discourse. The difficulties about the base and inductive steps are more salient among preservice elementary than secondary school teachers, but the difficulties about whether proofs by induction should be as encompassing as they could be are equally important for both groups. Implications for mathematics teacher education and future research are discussed in light of these findings.},
language = {en},
number = {3},
urldate = {2024-09-11},
journal = {Journal of Mathematics Teacher Education},
author = {Stylianides, Gabriel J. and Stylianides, Andreas J. and Philippou, George N.},
month = jun,
year = {2007},
keywords = {Collegiate mathematics, Content knowledge, Knowledge fragility, Mathematical induction, Mathematical reasoning, Proof, Tasks, Teacher education},
pages = {145--166},
}
@article{bateson_toward_1956,
title = {Toward a theory of schizophrenia},
volume = {1},
copyright = {Copyright © 1956 John Wiley \& Sons, Ltd.},
issn = {1099-1743},
url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/bs.3830010402},
doi = {10.1002/bs.3830010402},
abstract = {Schizophrenia—its nature, etiology, and the kind of therapy to use for it—remains one of the most puzzling of the mental illnesses. The theory of schizophrenia presented here is based on communications analysis, and specifically on the Theory of Logical Types. From this theory and from observations of schizophrenic patients is derived a description, and the necessary conditions for, a situation called the “double bind”—a situation in which no matter what a person does, he “can't win.” It is hypothesized that a person caught in the double bind may develop schizophrenic symptoms. How and why the double bind may arise in a family situation is discussed, together with illustrations from clinical and experimental data.},
language = {en},
number = {4},
urldate = {2024-09-11},
journal = {Behavioral Science},
author = {Bateson, Gregory and Jackson, Don D. and Haley, Jay and Weakland, John},
year = {1956},
pages = {251--264},
}
@article{herbst_engaging_2002,
title = {Engaging {Students} in {Proving}: {A} {Double} {Bind} on the {Teacher}},
shorttitle = {Engaging {Students} in {Proving}},
url = {https://pubs.nctm.org/view/journals/jrme/33/3/article-p176.xml},
doi = {10.2307/749724},
abstract = {This article uses a classroom episode in which a teacher and her students undertake a task of proving a proposition about angles as a context for analyzing what is involved in the teacher's work of engaging students in producing a proof. The analysis invokes theoretical notions of didactical contract and double bind to uncover and explain conflicting demands that the practice of assigning two-column proofs imposes on high school teachers. Two aspects of the work of teaching—what teachers do to create a task in which students can produce a proof and what teachers do to get students to prove a proposition—are the focus of the analysis of the episode. This analysis supports the argument that the traditional custom of engaging students in doing formal, two-column proofs places contradictory demands on the teacher regarding how the ideas for a proof will be developed. Recognizing these contradictory demands clarifies why the teacher in the analyzed episode ends up suggesting the key ideas for the proof. The analysis, coupled with current recommendations about the role of proof in school mathematics, suggests that it is advantageous for teachers to avoid treating proof only as a formal process.},
language = {en},
urldate = {2024-09-11},
author = {Herbst, Patricio G.},
month = may,
year = {2002},
journal = {Journal for Research in Mathematics Education},
volume = {33},
number = {3},
keywords = {Classroom interaction, Communication, Geometry, Knowledge, Proof, Teaching practice},
}
@article{knuth_secondary_2002,
title = {Secondary {School} {Mathematics} {Teachers}' {Conceptions} of {Proof}},
url = {https://pubs.nctm.org/view/journals/jrme/33/5/article-p379.xml},
doi = {10.2307/4149959},
abstract = {Recent reform efforts call on secondary school mathematics teachers to provide all students with rich opportunities and experiences with proof throughout the secondary school mathematics curriculum—opportunities and experiences that reflect the nature and role of proof in the discipline of mathematics. Teachers' success in responding to this call, however, depends largely on their own conceptions of proof. This study examined 16 in-service secondary school mathematics teachers' conceptions of proof. Data were gathered from a series of interviews and teachers' written responses to researcher-designed tasks focusing on proof. The results of this study suggest that teachers recognize the variety of roles that proof plays in mathematics; noticeably absent, however, was a view of proof as a tool for learning mathematics. The results also suggest that many of the teachers hold limited views of the nature of proof in mathematics and demonstrated inadequate understandings of what constitutes proof.},
language = {en},
urldate = {2024-09-11},
author = {Knuth, Eric J.},
month = nov,
year = {2002},
journal = {Journal for Research in Mathematics Education},
volume = {33},
number = {5},
keywords = {Proof, Secondary mathematics, Teacher beliefs, Teacher knowledge},
}
@article{herbst_when_2008,
title = {When, how, and why prove theorems? {A} methodology for studying the perspective of geometry teachers},
volume = {40},
issn = {1863-9704},
shorttitle = {When, how, and why prove theorems?},
url = {https://doi.org/10.1007/s11858-008-0082-3},
doi = {10.1007/s11858-008-0082-3},
abstract = {While every theorem has a proof in mathematics, in US geometry classrooms not every theorem is proved. How can one explain the practitioner’s perspective on which theorems deserve proof? Toward providing an account of the practical rationality with which practitioners handle the norm that every theorem has a proof we have designed a methodology that relies on representing classroom instruction using animations. We use those animations to trigger commentary from experienced practitioners. In this article we illustrate how we model instructional situations as systems of norms and how we create animated stories that represent a situation. We show how the study of those stories as prototypes of a basic model can help anticipate the response from practitioners as well as suggest issues to be considered in improving a model.},
language = {en},
number = {3},
urldate = {2024-09-11},
journal = {ZDM},
author = {Herbst, Patricio and Miyakawa, Takeshi},
month = aug,
year = {2008},
keywords = {Artificial Intelligence, Base Angle, Isosceles Triangle, Mathematics Teaching, Modeling Enterprise, Practical Rationality},
pages = {469--486},
}
@article{dimmel_what_2018,
title = {What {Details} {Do} {Teachers} {Expect} {From} {Student} {Proofs}? {A} {Study} of {Proof} {Checking} in {Geometry}},
shorttitle = {What {Details} {Do} {Teachers} {Expect} {From} {Student} {Proofs}?},
url = {https://pubs.nctm.org/view/journals/jrme/49/3/article-p261.xml},
doi = {10.5951/jresematheduc.49.3.0261},
abstract = {We investigated how secondary mathematics teachers check student geometry proofs. From video records of geometry teachers checking proofs, we conjectured that teachers have different expectations for details that follow from written statements than for details that are conveyed by diagrams. To test our conjectures, we randomly assigned 44 secondary mathematics teachers to 1 of 3 experiment groups (n = 13, n = 15, n = 16) in which they viewed and rated representations of instructional practice. Participants in each group viewed treatment or control versions of instructional scenarios and rated the appropriateness of the teachers’ work in different segments of each scenario. We compared participants’ ratings across and within experiment groups. We found that participants rated lower instruction that deviated from what we hypothesized to be their expectations, confirming our hypotheses.},
language = {en},
urldate = {2024-09-11},
author = {Dimmel, Justin K. and Herbst, Patricio G.},
month = may,
year = {2018},
journal = {Journal for Research in Mathematics Education},
volume = {49},
number = {3},
keywords = {Breaching experiments, Diagrammatic register, Instructional situations, Norms},
}
@phdthesis{sears_examination_nodate,
address = {United States -- Missouri},
type = {Ph.D.},
title = {An examination of how teachers use curriculum materials for the teaching of proof in high school geometry},
copyright = {Database copyright ProQuest LLC; ProQuest does not claim copyright in the individual underlying works.},
url = {https://www.proquest.com/docview/1114488216/abstract/922C194DEDD84A37PQ/1},
abstract = {This case study examined how three high school geometry teachers used their geometry textbooks (Prentice Hall Geometry and McDougal Littell Geometry) to teach proof. More specifically, this study examined the following: How do subject-specific curriculum materials present proofs related to parallel and perpendicular lines, angles, and congruent triangles? How do geometry teachers use curriculum materials to facilitate students’ learning to prove? What factors influence teachers’ decisions to deviate or not from curriculum materials? Data were collected via a classroom observation protocol, teacher artifacts, audio and video classroom recording, and teacher interviews. A conceptual analytical framework, which consisted of three dimensions, comprised of the Mathematical Tasks Framework (Henningsen \& Stein, 1997) and proof schemes framework (Harel \& Sowder, 1998) was used to analyze the data. The first dimension focused on task features, the second on levels of cognitive demands and the third considered the proof schemes utilized.
To inform the classroom observations and the data analysis, a textbook analysis was conducted of proof and proof-related tasks. This analysis considered the frequency of proof and proof-related tasks, types of proof representations used, real world or abstract context of proofs, use of figures, occurrences of fill in the blank and multiple choice tasks, and the extent to which tasks were composed of multiple parts. Additionally, the levels of cognitive demand of tasks were evaluated. During classroom observations, attention was given to what constitutes convincing proof arguments, and how curriculum materials were utilized.
The data analysis showed that the geometry curriculum materials used by the teachers in this study provided few opportunities to prove, and that there were differences between textbook series in the tasks’ features and the levels of cognitive demand of the proof tasks included. Additionally, the teachers in this study enacted proof tasks generally by promoting memorization or procedures without connections. Moreover, whenever lower-level cognitive demand tasks were posed, external conviction proof schemes were more evident, while analytical proof schemes appeared more frequently when higher-level cognitive demand tasks were posed. Furthermore, teachers’ beliefs, experience, desire to make mathematics “easy”, professional community, and assessment were factors that contributed to how proof was taught.},
language = {English},
urldate = {2024-09-11},
school = {University of Missouri - Columbia},
author = {Sears, Ruthmae},
note = {ISBN: 9781267708878},
keywords = {Cognitive demands, Curriculum materials, Education, Geometry, Mathematics education, Proof, Reasoning, Secondary education},
}
@article{broda_pandora_2007,
title = {Pandora: {A} {Reasoning} {Toolbox} using {Natural} {Deduction} {Style}},
volume = {15},
issn = {1368-9894},
shorttitle = {Pandora},
url = {https://ieeexplore.ieee.org/document/8131694},
doi = {10.1093/jigpal/jzm020},
abstract = {Pandora is a tool for supporting the learning of first order natural deduction. It includes a help window, an interactive context sensitive tutorial known as the “e-tutor” and facilities to save, reload and export to LaTeX. Every attempt to apply a natural deduction rule is met with either success or a helpful error message, providing the student with instant feedback. Detailed electronic logs of student usage are recorded for evaluation purposes. This paper describes the basic functionality, the e-tutor, our experiences of using the tool in teaching and our future plans.},
number = {4},
urldate = {2024-09-10},
journal = {Logic Journal of the IGPL},
author = {Broda, Krysia and Ma, Jiefei and Sinnadurai, Gabrielle and Summers, Alexander},
month = aug,
year = {2007},
note = {Conference Name: Logic Journal of the IGPL},
keywords = {Fitch box proof, e-learning, first order logic, learning, natural deduction, predicate logic, reasoning about programs, teaching},
pages = {293--304},
}
@article{gruetter_live_2024,
title = {Live {Verification} in an {Interactive} {Proof} {Assistant}},
volume = {8},
url = {https://dl.acm.org/doi/10.1145/3656439},
doi = {10.1145/3656439},
abstract = {We present a prototype for a tool that enables programmers to verify their code as they write it in real-time. After each line of code that the programmer writes, the tool tells the programmer whether it was able to prove absence of undefined behavior so far, and it displays a concise representation of the symbolic state of the program right after the added line. The user can then either write the next line of code, or if needed or desired, write a specially marked comment that provides hints on how to solve side conditions or on how to represent the symbolic state more nicely. Once the programmer has finished writing the program, it is already verified with a mathematical correctness proof. Other tools providing real-time feedback already exist, but ours is the first one that only relies on a small trusted proof checker and that provides a concise summary of the symbolic state at the point in the program currently being edited, as opposed to only indicating whether user-stated assertions or postconditions hold. Program verification requires loop invariants, which are hard to find and tedious to spell out. We explore a middle ground in the design space between the two extremes of requiring users to spell out loop invariants manually and attempting to infer loop invariants automatically: Since a loop invariant often looks quite similar to the symbolic state right before the loop, our tool asks the user to express the desired loop invariant as a diff from the symbolic state before the loop, which has the potential to lead to shorter, more maintainable proofs. We prototyped our technique in the interactive proof assistant Coq, so our framework creates machine-checked proofs that the developed functions satisfy their specifications when executed according to the formal semantics of the source language. Using a verified compiler proven against the same source-language semantics, we can ensure that the behavior of the compiled program matches the program's behavior as represented by the framework during the proof. Additionally, since our polyglot source files can be viewed as Coq or C files at the same time, users willing to accept a larger trusted code base can compile them with GCC.},
number = {PLDI},
urldate = {2024-09-10},
journal = {Proceedings of the ACM on Programming Languages},
author = {Gruetter, Samuel and Fukala, Viktor and Chlipala, Adam},
month = jun,
year = {2024},
pages = {209:1535--209:1558},
}
@article{dyer_applying_2022,
title = {Applying cognitive principles to model-finding output: the positive value of negative information},
volume = {6},
shorttitle = {Applying cognitive principles to model-finding output},
url = {https://dl.acm.org/doi/10.1145/3527323},
doi = {10.1145/3527323},
abstract = {Model-finders, such as SAT/SMT-solvers and Alloy, are used widely both directly and embedded in domain-specific tools. They support both conventional verification and, unlike other verification tools, property-free exploration. To do this effectively, they must produce output that helps users with these tasks. Unfortunately, the output of model-finders has seen relatively little rigorous human-factors study. Conventionally, these tools tend to show one satisfying instance at a time. Drawing inspiration from the cognitive science literature, we investigate two aspects of model-finder output: how many instances to show at once, and whether all instances must actually satisfy the input constraints. Using both controlled studies and open-ended talk-alouds, we show that there is benefit to showing negative instances in certain settings; the impact of multiple instances is less clear. Our work is a first step in a theoretically grounded approach to understanding how users engage cognitively with model-finder output, and how those tools might better support users in doing so.},
number = {OOPSLA1},
urldate = {2024-09-10},
journal = {Proceedings of the ACM on Programming Languages},
author = {Dyer, Tristan and Nelson, Tim and Fisler, Kathi and Krishnamurthi, Shriram},
month = apr,
year = {2022},
pages = {79:1--79:29},
}
@article{walcott_making_2009,
title = {Making sense of shape: {An} analysis of children's written responses},
volume = {28},
issn = {0732-3123},
shorttitle = {Making sense of shape},
url = {https://www.sciencedirect.com/science/article/pii/S0732312309000194},
doi = {10.1016/j.jmathb.2009.04.001},
abstract = {In this study, we examine a large set of student responses to a constructed-response geometry item on the National Assessment of Educational Progress (NAEP) administered in 1992 and 1996. The item asks students to name the similarities and differences between a parallelogram and a rectangle of equal area presented side by side on a grid. Through categorization of student work utilizing the constant comparison method [Lincoln, Y. S., \& Guba, E. G. (1985). Naturalistic Inquiry, Beverly Hills, CA: Sage Publications], we identified two distinct categories of responses. The first group of responses indicates that students view the two figures as the same based on flexible prototypes while the second group indicates students view the two figures as distinct shapes based on inflexible prototypes. The research uncovers responses that show evidence of students’ development of a dynamic figural concept in which student understanding is based on sense-making that involves mentally manipulating the shapes. In addition, the research highlights the richness of the evidence obtained from constructed-response items administered as part of standardized assessments.},
number = {1},
urldate = {2024-09-10},
journal = {The Journal of Mathematical Behavior},
author = {Walcott, Crystal and Mohr, Doris and Kastberg, Signe E.},
month = mar,
year = {2009},
keywords = {Children's thinking, Constructed-response assessment, Geometry concept formation, Shape classification strategies},
pages = {30--40},
}
@article{bodemer_active_2004,
series = {Dynamic {Visualisations} and {Learning}},
title = {The active integration of information during learning with dynamic and interactive visualisations},
volume = {14},
issn = {0959-4752},
url = {https://www.sciencedirect.com/science/article/pii/S0959475204000350},
doi = {10.1016/j.learninstruc.2004.06.006},
abstract = {Computer-based learning environments commonly comprise symbolic as well as static and dynamic pictorial representations, frequently combined with the possibility of modifying them interactively. While multiple, dynamic and interactive external representations have the potential to improve learning in various ways, they also place specific demands on learners, such as the need to process and relate different representations, to control and evaluate interactions with these representations, and to construct coherent mental representations. Because learners frequently are not able to meet these demands, the presentation of multiple, dynamic and interactive representations might not only not improve but might even impede learning. Starting from cognitive load theory as well as from structure mapping theory, we developed support measures that encouraged learners to actively integrate symbolic and pictorial representations and to interact with dynamic pictorial representations in a structured and reflective way. In two experimental studies, the learning effects of the encouraged activities were evaluated. Analyses of variance revealed (1) that the active integration of different representations improved learning significantly and (2) that the structured interaction with different representations specifically increased verbal understanding.},
number = {3},
urldate = {2024-09-10},
journal = {Learning and Instruction},
author = {Bodemer, Daniel and Ploetzner, Rolf and Feuerlein, Inge and Spada, Hans},
month = jun,
year = {2004},
keywords = {Cognitive load, Dynamic visualisations, Multimedia, Multiple external representations, Simulations, Structure mapping},
pages = {325--341},
}
@article{fischbein_theory_1993,
title = {The theory of figural concepts},
volume = {24},
issn = {1573-0816},
url = {https://doi.org/10.1007/BF01273689},
doi = {10.1007/BF01273689},
abstract = {The main thesis of the present paper is that geometry deals with mental entities (the so-called geometrical figures) which possess simultaneously conceptual and figural characters. A geometrical sphere, for instance, is an abstract ideal, formally determinable entity, like every genuine concept. At the same time, it possesses figural properties, first of all a certain shape. The ideality, the absolute perfection of a geometrical sphere cannot be found in reality. In this symbiosis between concept and figure, as it is revealed in geometrical entities, it is the image component which stimulates new directions of thought, but there are the logical, conceptual constraints which control the formal rigour of the process. We have called the geometrical figures figural concepts because of their double nature. The paper analyzes the internal tensions which may appear in figural concepts because of this double nature, development aspects and didactical implications.},
language = {en},
number = {2},
urldate = {2024-09-10},
journal = {Educational Studies in Mathematics},
author = {Fischbein, Efraim},
month = feb,
year = {1993},
keywords = {Development Aspect, Figural Property, Geometrical Figure, Image Component, Internal Tension},
pages = {139--162},
}
@incollection{manders_euclidean_2008,
title = {The {Euclidean} {Diagram}},
booktitle = {The {Philosophy} of {Mathematical} {Practice}},
publisher = {Oxford University Press},
author = {Manders, Kenneth},
editor = {Mancosu, Paolo},
year = {2008},
pages = {80--133},
}
@inproceedings{wu_ffl_2023,
address = {New York, NY, USA},
series = {{UIST} '23},
title = {{FFL}: {A} {Language} and {Live} {Runtime} for {Styling} and {Labeling} {Typeset} {Math} {Formulas}},
isbn = {9798400701320},
shorttitle = {{FFL}},
url = {https://dl.acm.org/doi/10.1145/3586183.3606731},
doi = {10.1145/3586183.3606731},
abstract = {As interest grows in learning math concepts in fields like data science and machine learning, it is becoming more important to help broad audiences engage with math notation. In this paper, we explore how authoring tools can help authors better style and label formulas to support their readability. We introduce a markup language for augmenting formulas called FFL, or “Formula Formatting Language,” which aims to lower the threshold to stylize and diagram formulas. The language is designed to be concise, writable, readable, and integrable into web-based document authoring environments. It was developed with an accompanying runtime that supports live application of augmentations to formulas. Our lab study shows that FFL improves the speed and ease of editing augmentation markup, and the readability of augmentation markup compared to baseline LaTeX tools. These results clarify the role tooling can play in supporting the explanation of math notation.},
urldate = {2024-09-10},
booktitle = {Proceedings of the 36th {Annual} {ACM} {Symposium} on {User} {Interface} {Software} and {Technology}},
publisher = {Association for Computing Machinery},
author = {Wu, Zhiyuan and Li, Jiening and Ma, Kevin and Kambhamettu, Hita and Head, Andrew},
month = oct,
year = {2023},
pages = {1--16},
}
@inproceedings{head_math_2022,
address = {New York, NY, USA},
series = {{CHI} '22},
title = {Math {Augmentation}: {How} {Authors} {Enhance} the {Readability} of {Formulas} using {Novel} {Visual} {Design} {Practices}},
isbn = {978-1-4503-9157-3},
shorttitle = {Math {Augmentation}},
url = {https://dl.acm.org/doi/10.1145/3491102.3501932},
doi = {10.1145/3491102.3501932},
abstract = {With the increasing growth and impact of machine learning and other math-intensive fields, it is more important than ever to broaden access to mathematical notation. Can new visual and interactive displays help a wider readership successfully engage with notation? This paper provides the first detailed qualitative analysis of math augmentation—the practice of embellishing notation with novel visual design patterns to improve its readability. We present two qualitative studies of the practice of math augmentation. First is an analysis of 1.1k augmentations to 281 formulas in 47 blogs, textbooks, and other documents containing mathematical expressions. Second is an interview study with 12 authors who had previously designed custom math augmentations (“maugs”). This paper contributes a comprehensive inventory of the kinds of maugs that appear in math documents, and a detailed account of how authors’ tools ought to be redesigned to support efficient creation of math augmentations. These studies open a critical new design space for HCI researchers and interface designers.},
urldate = {2024-09-10},
booktitle = {Proceedings of the 2022 {CHI} {Conference} on {Human} {Factors} in {Computing} {Systems}},
publisher = {Association for Computing Machinery},
author = {Head, Andrew and Xie, Amber and Hearst, Marti A.},
month = apr,
year = {2022},
pages = {1--18},
}
@misc{ball_teaching_2003,
title = {The teaching of proof},
url = {http://arxiv.org/abs/math/0305021},
doi = {10.48550/arXiv.math/0305021},
abstract = {This panel draws on research of the teaching of mathematical proof, conducted in five countries at different levels of schooling. With a shared view of proof as essential to the teaching and learning of mathematics, the authors present results of studies that explore the challenges for teachers in helping students learn to reason in disciplined ways about mathematical claims.},
urldate = {2024-09-09},
publisher = {arXiv},
author = {Ball, Deborah Loewenberg and Hoyles, Celia and Jahnke, Hans Niels and Movshovitz-Hadar, Nitsa},
month = apr,
year = {2003},
note = {arXiv:math/0305021},
keywords = {97C30, 97C50, 97D20, Mathematics - History and Overview},
}
@article{stylianides_proof_2007,
title = {Proof and {Proving} in {School} {Mathematics}},
url = {https://pubs.nctm.org/view/journals/jrme/38/3/article-p289.xml},
doi = {10.2307/30034869},
abstract = {Many researchers and curriculum frameworks recommend that the concept of proof and the corresponding activity of proving become part of students' mathematical experiences throughout the grades. Yet it is still unclear what “proof” means in school mathematics, especially in the elementary grades, and what role teachers have in cultivating proof and proving among their students. In this article, I propose a conceptualization of the meaning of proof in school mathematics and use classroom episodes from third grade to elaborate elements of this conceptualization and to illustrate its applicability even in the early elementary grades. Furthermore, I use the conceptualization to develop a tool to analyze the classroom episodes and to examine aspects of the teachers' role in managing their students' proving activity. This analysis supports the development of a framework about instructional practices for cultivating proof and proving in school mathematics.},
language = {en},
urldate = {2024-09-09},
author = {Stylianides, Andreas L.},
month = may,
year = {2007},
journal = {Journal for Research in Mathematics Education},
volume = {38},
number = {3},
keywords = {All school levels, Instructional intervention, Logic and proof, Proof, Qualitative methods, Reasoning},
}
@article{nathan_embodied_2021,
title = {Embodied geometric reasoning: {Dynamic} gestures during intuition, insight, and proof.},
volume = {113},
copyright = {http://www.apa.org/pubs/journals/resources/open-access.aspx},
issn = {1939-2176, 0022-0663},
shorttitle = {Embodied geometric reasoning},
url = {https://doi.apa.org/doi/10.1037/edu0000638},
doi = {10.1037/edu0000638},
abstract = {Grounded and embodied cognition (GEC) serves as a framework to investigate mathematical reasoning for proof (reasoning that is logical, operative and general), insight (gist) and intuition (snap judgment). Geometry is the branch of mathematics concerned with generalizable properties of shape and space. Mathematics experts (N=46) and non-experts (N=44) were asked to judge the truth and to justify their judgments for four geometry conjectures. Videotaped interviews were transcribed and coded for occurrences of gestures and speech during the proof production process. Analyses provide empirical support for claims that geometry proof production is an embodied activity, even when controlling for math expertise, language use and spatial ability. Dynamic depictive gestures portray generalizable properties of shape and space through enactment of transformational operations (e.g., dilation, skewing). Occurrence of dynamic depictive gestures and non-dynamic depictive gestures are associated with proof performance, insight, and intuition, as hypothesized, over and above contributions of spoken language. Geometry knowledge for proof may be embodied and accessed and revealed through actions and the transformational speech utterances describing these actions. These findings have implications for instruction, assessment of embodied knowledge, and the design of educational technology to facilitate mathematical reasoning by promoting and tracking dynamic gesture production and transformational speech.},
language = {en},
number = {5},
urldate = {2024-09-09},