-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathimplementation.lyx
8010 lines (6027 loc) · 176 KB
/
implementation.lyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#LyX 2.2 created this file. For more info see http://www.lyx.org/
\lyxformat 508
\begin_document
\begin_header
\save_transient_properties true
\origin unavailable
\textclass extreport
\use_default_options true
\begin_modules
fixltx2e
fix-cm
theorems-ams-bytype
\end_modules
\maintain_unincluded_children false
\language british
\language_package default
\inputencoding auto
\fontencoding global
\font_roman "default" "default"
\font_sans "default" "default"
\font_typewriter "default" "default"
\font_math "auto" "auto"
\font_default_family default
\use_non_tex_fonts false
\font_sc false
\font_osf false
\font_sf_scale 100 100
\font_tt_scale 100 100
\graphics default
\default_output_format default
\output_sync 0
\bibtex_command default
\index_command default
\paperfontsize default
\spacing single
\use_hyperref false
\papersize default
\use_geometry false
\use_package amsmath 1
\use_package amssymb 1
\use_package cancel 1
\use_package esint 1
\use_package mathdots 1
\use_package mathtools 1
\use_package mhchem 1
\use_package stackrel 1
\use_package stmaryrd 1
\use_package undertilde 1
\cite_engine basic
\cite_engine_type default
\biblio_style plain
\use_bibtopic false
\use_indices false
\paperorientation portrait
\suppress_date true
\justification true
\use_refstyle 1
\index Index
\shortcut idx
\color #008000
\end_index
\secnumdepth 3
\tocdepth 1
\paragraph_separation indent
\paragraph_indentation default
\quotes_language english
\papercolumns 1
\papersides 1
\paperpagestyle default
\listings_params "basicstyle={\ttfamily\small}"
\tracking_changes false
\output_changes false
\html_math_output 0
\html_css_as_file 0
\html_be_strict false
\end_header
\begin_body
\begin_layout Chapter
\start_of_appendix
\begin_inset CommandInset label
LatexCommand label
name "chap:Runtime-Implementation"
\end_inset
Runtime Implementation
\end_layout
\begin_layout Standard
The implementation of the ideas presented in this thesis, in the form of
the Pony compiler and the Pony runtime, represents a significant amount
of software engineering as well as a certain amount of systems research.
The purpose of this appendix is to serve as a guide to some of the subtler
aspects of the implementation of the Pony runtime library.
The elements of the runtime explained here are those most influenced by
the type system and garbage collection work that represent the bulk of
this thesis.
The dissection of the subtler aspects of the Pony compiler is left as an
exercise for the reader.
\end_layout
\begin_layout Section
Component Architecture
\end_layout
\begin_layout Standard
The Pony runtime consists of several core components.
Figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:Runtime-architecture"
\end_inset
shows the structure of those components.
At the bottom is the pool allocator, which handles memory allocation and
deallocation.
On top of this is built the Single-Producer Multiple-Consumer (SPMC) queue,
the Multiple-Producer Single-Consumer (MPSC) queue, and the page map.
SPMC and MPSC queues are used to build the scheduler, an MPSC queue is
used to build the asynchronous I/O handler, and the page map is used to
build per-actor heaps.
Another MPSC queue and the heap are used to build the actors themselves,
which depend on both the scheduler and the asynchronous I/O handler.
Tracing garbage collection (GC), sharing GC, and actor GC are built on
top of actors, heaps, the page map, and the pool allocator.
\end_layout
\begin_layout Standard
Sharing GC, as described in chapter
\begin_inset CommandInset ref
LatexCommand ref
reference "chap:Object-Collection"
\end_inset
, uses tracing GC during message send and receive.
Actor GC, as described in chapter
\begin_inset CommandInset ref
LatexCommand ref
reference "chap:Actor-Collection"
\end_inset
, relies on sharing GC in order to track implicit references from some actor
\begin_inset Formula $\alpha_{1}$
\end_inset
to some actor
\begin_inset Formula $\alpha_{2}$
\end_inset
that arise due to
\begin_inset Formula $\alpha_{1}$
\end_inset
having a reference to some object
\begin_inset Formula $\omega$
\end_inset
that is owned by
\begin_inset Formula $\alpha_{2}$
\end_inset
, as shown in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Reachability"
\end_inset
.
In addition, tracing GC invokes sharing GC when objects owned by other
actors are encountered during the trace.
\end_layout
\begin_layout Standard
\begin_inset Float figure
placement h
wide false
sideways false
status open
\begin_layout Plain Layout
\align center
\begin_inset Graphics
filename graphics/graphics.001.png
width 50col%
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:Runtime-architecture"
\end_inset
Runtime architecture
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Every component's design is influenced by the type system.
This influence is particularly felt in the garbage collection components,
but the impact is felt at every level, including the memory allocator.
\end_layout
\begin_layout Section
Naming Conventions
\end_layout
\begin_layout Standard
The Pony runtime is implemented in C, and maintains a C API and ABI in order
to be usable from programming languages other than Pony.
To avoid conflicts with symbols exported from other libraries, functions
and types intended to be used at the language level are prefixed with
\family typewriter
pony_
\family default
, while functions intended to be used only internally by the runtime itself
are prefixed with
\family typewriter
ponyint_
\family default
.
Types that are only used internally have no prefix, since they do not appear
in the symbol table or any publicly accessible header, and so do not cause
conflicts.
\end_layout
\begin_layout Section
Pool Allocator
\end_layout
\begin_layout Standard
General purpose memory allocation is a complex problem.
Manual memory management allocators have seen significant research in malloc-li
ke allocators that scale better as core counts increase, including
\family typewriter
tcmalloc
\family default
\begin_inset CommandInset citation
LatexCommand cite
key "ghemawat2009tcmalloc"
\end_inset
,
\family typewriter
jemalloc
\family default
\begin_inset CommandInset citation
LatexCommand cite
key "evans2011scalable"
\end_inset
, the
\family typewriter
Intel TBB
\family default
allocator
\begin_inset CommandInset citation
LatexCommand cite
key "reinders2007intel"
\end_inset
, and most recently
\family typewriter
scalloc
\family default
\begin_inset CommandInset citation
LatexCommand cite
key "aigner2015fast"
\end_inset
.
This category of memory allocator faces several related problems: finding
free memory large enough to fulfil the requested allocation size without
wasting excessive memory, coalescing freed memory to allow larger allocations
to reuse memory from multiple smaller freed memory blocks, and coordinating
allocation across threads.
Similar problems emerge with automatic memory management allocators, with
the additional complication that the garbage collector must interact with
the allocator, and that a compacting garbage collector may move objects
in memory.
\end_layout
\begin_layout Standard
The Pony memory allocator is divided into three parts: an underlying pool
allocator, a page map, and a heap implementation built on top.
We'll begin by examining the pool allocator.
It is important to note that the pool allocator is not directly used for
language-level allocation.
That is, when a new object is allocated in Pony, it is allocated from the
currently executing actor's heap, rather than directly from the pool allocator.
The pool allocator is used for runtime data structures, including actor
heaps themselves.
\end_layout
\begin_layout Standard
To reduce contention, the pool allocator operates as a thread-local allocator.
When we examine the scheduler, we'll see that this results in each core
having a pool allocator that is pinned to that core.
This pool allocator is responsible for pulling new pages from the kernel
when necessary, managing size classed free lists, and making free lists
available to other thread-local pool allocators when possible.
\end_layout
\begin_layout Standard
When a pool allocator is asked to allocate memory, it tries three sources,
in order:
\end_layout
\begin_layout Enumerate
The thread-local size classed free list for amount of memory requested,
rounded to the next higher size class.
\end_layout
\begin_layout Enumerate
The global size classed list of free lists for that size class, which is
used to prevent producer-consumer allocation starvation.
\end_layout
\begin_layout Enumerate
The thread-local free block.
\end_layout
\begin_layout Enumerate
The operating system, by allocating new pages.
\end_layout
\begin_layout Subsection
Size Classes
\end_layout
\begin_layout Standard
Size classes are used to simplify free list management.
This approach segregates free lists by the size of the allocation, and
limits the available allocation sizes.
The implementation currently uses power-of-two sized classes ranging from
\begin_inset Formula $2^{5}$
\end_inset
to
\begin_inset Formula $2^{20}$
\end_inset
bytes, but these values are easily tuneable.
Allocations exceeding the largest size class begin with the thread-local
free block.
\end_layout
\begin_layout Standard
Such segregated free lists have a cost, in that allocations sizes are rounded
up to a size class.
For example, using power-of-two size classes, a 65 byte allocation request
will instead allocate 128 bytes.
However, in return, allocations do not require that memory in the allocated
block be used to indicate the allocated size, and allocations are always
cache line aligned.
\end_layout
\begin_layout Standard
In addition, no free list searching is required on allocation.
Using the head of the free list for a size class approximates a best-fit
search of all free memory, which improves memory utilisation and reduces
fragmentation.
\end_layout
\begin_layout Subsection
Thread-local Size Classed Free List
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
typedef struct pool_item_t {
\end_layout
\begin_layout Plain Layout
struct pool_item_t* next;
\end_layout
\begin_layout Plain Layout
} pool_item_t;
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:A-thread-local-free"
\end_inset
A thread-local free list node
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
Each pool allocator keeps a linked list of free memory for each size class.
Initially, these free lists are empty, which is encoded as a
\family typewriter
null
\family default
pointer.
\end_layout
\begin_layout Standard
When the pool allocator is told to free memory within a size class, it prepends
that memory to the appropriate free list.
The memory being freed is reused as the linked list node, so that there
is no memory overhead associated with free memory, as shown in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:A-thread-local-free"
\end_inset
.
\end_layout
\begin_layout Standard
The memory is prepended, following a last-in first-out strategy, in order
to improve cache locality: if memory of that size class is quickly allocated
and used again, it is more likely that the memory remains in cache.
\end_layout
\begin_layout Standard
When memory is freed to a thread-local free list, the provenance of the
memory is not examined.
Any pool allocator can place memory allocated by any other pool allocator
on its thread-local free list for that size class.
As a result, it is not necessary to perform any checks as to the origin
of memory when it is freed.
This differs from
\emph on
arena
\emph default
based allocators, such as
\family typewriter
jemalloc
\family default
\begin_inset CommandInset citation
LatexCommand cite
key "evans2011scalable"
\end_inset
, that must return free memory to the allocating arena.
For the Pony runtime, this behaviour is critical, as it allows both runtime
data structures and heap chunks to be efficiently freed regardless of which
thread an actor executes on.
On the other hand, this approach makes coalescing free memory, either to
reuse as a different size class or to return to the operating system, more
difficult.
The pool allocator relies on a stable state for a program rather than coalescin
g, which is problematic for some applications.
\end_layout
\begin_layout Subsection
Global List of Size Classed Free Lists
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
typedef struct pool_central_t {
\end_layout
\begin_layout Plain Layout
pool_item_t* next;
\end_layout
\begin_layout Plain Layout
uintptr_t length;
\end_layout
\begin_layout Plain Layout
struct pool_central_t* central;
\end_layout
\begin_layout Plain Layout
} pool_central_t;
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:A-global-list"
\end_inset
A global list of free lists node
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout
\begin_layout Standard
The system as a whole also keeps a single global list of free lists for
each size class.
This is to avoid memory starvation in a producer-consumer scenario where
one scheduler thread is consistently allocating memory and another scheduler
thread is consistently freeing it.
\end_layout
\begin_layout Standard
When a thread-local free list exceeds a certain size, defaulting to
\begin_inset Formula $2^{20}$
\end_inset
bytes, that free list is pushed to a global list of free lists.
The maximum size of a thread-local free list is a global compile-time constant,
rather than being adaptive.
It is possible that an adaptive approach could yield additional performance
benefits, at the cost of evaluating some heuristic.
\end_layout
\begin_layout Standard
The first linked list node in the thread-local free list is reinterpreted
as the first node in a global list of lists.
This involves reusing available space in the node
\emph on
after
\emph default
the pointer to the next node in the thread-local free list to record both
the length of the current list and a pointer to the next global list, as
shown in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:A-global-list"
\end_inset
.
Note that a
\family typewriter
pool_central_t
\family default
maintains the same initial structure as a
\family typewriter
pool_item_t
\family default
.
\end_layout
\begin_layout Standard
To make this possible, the minimum allocation size must be sufficient to
hold all three values, i.e.
24 bytes in a 64-bit runtime or 12 bytes in a 32-bit runtime.
The minimum allocation size is a compile-time constant that defaults to
32 bytes.
\end_layout
\begin_layout Standard
An atomic compare-and-swap loop is used to change the global list of free
lists pointer for the size class being pushed, so that any number of pool
allocators can safely attempt to push a free list to the global list of
lists simultaneously.
\end_layout
\begin_layout Standard
Similarly, when a pool allocator's thread-local free list is empty, it examines
the global list of free lists.
Crucially, if that global list is empty, which is encoded as a
\family typewriter
null
\family default
pointer, no atomic operation is required, and the pool allocator tries
its next source of free memory.
If the global list of free lists contains one or more lists, an atomic
compare-and-swap loop is used to remove one list from global list.
That list is then used as the thread-local free list, allowing memory to
be returned immediately.
\end_layout
\begin_layout Subsection
Thread-local Free Block
\end_layout
\begin_layout Standard
A pool allocator also keeps a thread-local free block.
This is a list of free memory, insert-sorted by size, that is initially
empty.
If no size classed memory is available, the first free block in this list
large enough to fulfil the allocation is used.
If a block is available, it is fragmented into the requested size and any
remaining memory, with any remaining memory being insert-sorted back into
the free block.
\end_layout
\begin_layout Standard
This approach allows large allocations to be reused for future large allocations
, or fragmented into future size classed allocations.
It also allows freed large allocations to coexist with new pages pulled
from the operating system.
\end_layout
\begin_layout Subsection
New Pages
\end_layout
\begin_layout Standard
If the thread-local and global size classed pools are empty, and the free
block is either empty or contains no memory block large enough to fulfil
the allocation request, the pool allocator uses an underlying system call
(
\family typewriter
mmap
\family default
on UNIX-like operating systems,
\family typewriter
VirtualAlloc
\family default
on Windows) to request a number of pages mapped as a contiguous chunk of
address space.
Some empirical testing with various work loads has lead to choosing a 128
megabyte chunk in a 64-bit address space and a 16 megabyte chunk in a 32-bit
address space as the default.
These values can be easily changed in the runtime.
\end_layout
\begin_layout Standard
This address space is mapped using huge pages, where available, to minimise
the number of address translations, reducing pressure on the translation
lookaside buffer (TLB).
This also reduces the TLB lookup pressure on the L2 cache, leaving more
cache for application data.
However, this address space is not pre-faulted.
That is, no data is written to the mapped address space.
This prevents unused pages from being mapped to physical memory, and significan
tly reduces allocation-time jitter, particularly on NUMA machines.
\end_layout
\begin_layout Standard
The mapped address space is then fragmented into the requested allocation
size and any remaining size, with the remaining memory kept in the thread-local
free block for future allocations.
\end_layout
\begin_layout Subsection
Fragmentation
\end_layout
\begin_layout Standard
The pool allocator as currently implemented has no mechanism for coalescing
free memory.
As a result, small allocations can result in fragmentation, such that free
memory is available, but a request for a single large contiguous address
space fails.
\end_layout
\begin_layout Standard
This deficiency is addressed in the per-actor heap, rather than in the pool
allocator, as explained in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Small-Allocation-Fragmentation"
\end_inset
.
This allows the pool allocator itself to avoid jitter caused by attempting
to coalesce free memory.
\end_layout
\begin_layout Standard
A key area of future work is to add a coalescing mechanism for pool allocator
fragmentation that does not overly adversely affect common cases that do
not require coalescing, such as repeatedly allocating and freeing buffers
used for I/O operations.
In particular, the page map (see section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Page-Map"
\end_inset
) can be extended to cheaply track free contiguous allocations.
The challenge is to cheaply coalesce these even when they are allocated
and freed by different threads.
\end_layout
\begin_layout Standard
Allocations that exceed the size class limit of the pool allocator are added
to the insert-sorted list of free blocks for the local pool allocator.
For these very large allocations, the pool allocator performs no better
than existing allocators.
\end_layout
\begin_layout Subsection
Role of the Type System
\end_layout
\begin_layout Standard
The type system has an indirect effect on the design of the pool allocator.
The use of size classed allocations that do not record their size within
the allocated memory is made possible because the program always knows
the size of any memory being freed.
This approach, which does not conform to the
\family typewriter
malloc
\family default
interface, could be used seamlessly in any language that knows allocation
sizes, either statically or dynamically.
It can also be used in languages where sizes are not known if the program
explicitly tracks allocation sizes.
For example, the Pony compiler, although implemented in
\family typewriter
C
\family default
, uses the pool allocator.
\end_layout
\begin_layout Section
\begin_inset CommandInset label
LatexCommand label
name "subsec:Page-Map"
\end_inset
Page Map
\end_layout
\begin_layout Standard
The runtime keeps a radix tree (two-level on 32-bit architectures, three-level
on 64-bit architectures).
Page map levels are described in figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:Page-map-levels"
\end_inset
.
The bottom level of the radix tree refers to
\begin_inset Formula $2^{10}$
\end_inset
byte regions, allowing the page map to associate a pointer to a chunk descripto
r with each one.
These regions are the
\emph on
pages
\emph default
for the Pony runtime, although they are smaller than either the standard
(4 kilobyte) or huge (2 megabyte) pages used by some CPUs, such as
\family typewriter
ARM64
\family default
and
\family typewriter
x86
\family default
.
Using a radix tree allows the page map to associate a pointer with every
\begin_inset Formula $2^{10}$
\end_inset
byte region with a total memory overhead of 0.39% on a 32-bit architecture,
or 0.78% on a 64-bit architecture.
\end_layout
\begin_layout Standard
The alignment of the memory areas managed by chunks, as described in section
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Heaps"
\end_inset
, is exploited to give a simple mask-based lookup to find the chunk descriptor
for any address.
This is used during garbage collection to find the chunk descriptor for
an object, as detailed in sections
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Tracing-GC"
\end_inset
and
\begin_inset CommandInset ref
LatexCommand ref
reference "sec:Sharing-GC"
\end_inset
.
\end_layout
\begin_layout Standard
A page map is used rather than embedding a chunk descriptor in allocated
memory as an allocation header.
Doing so significantly reduces the memory overhead for small allocations,
as described in section
\begin_inset CommandInset ref
LatexCommand ref
reference "subsec:Small-Allocation"
\end_inset
.
In addition, it allows the runtime to allocate and garbage collect objects
that conform to
\family typewriter
C/C++
\family default
layout conventions on a particular platform, which is important for an
efficient foreign function interface.
Pony also follows these platform specific layout conventions.
\end_layout
\begin_layout Standard
The page map is a global rather than a thread-local data structure, with
the root of the page map being a single global pointer.
Because it can be written to for any allocation, and is read from during
garbage collection, thread contention must be minimised.
\end_layout
\begin_layout Standard
In figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:Reading-from-the"
\end_inset
, we see that page map reads require no thread coordination and no atomic
operations.
In figure
\begin_inset CommandInset ref
LatexCommand ref
reference "fig:Writing-to-the"
\end_inset
, we see that writing to the page map may result in a non-looping atomic
compare-and-swap.
This is used to extend the page map.
This results in worst case behaviour of three atomic instructions on write,
which happens only when the page map is entirely empty.
The common case involves no atomic instructions and no thread coordination.
\end_layout
\begin_layout Standard
\begin_inset Float figure
wide false
sideways false
status open
\begin_layout Plain Layout
\begin_inset listings
inline false
status open
\begin_layout Plain Layout
typedef struct pagemap_level_t {
\end_layout
\begin_layout Plain Layout
int shift;
\end_layout
\begin_layout Plain Layout
int mask;
\end_layout
\begin_layout Plain Layout
size_t size;
\end_layout
\begin_layout Plain Layout
size_t size_index;
\end_layout
\begin_layout Plain Layout
} pagemap_level_t;
\end_layout
\end_inset
\end_layout
\begin_layout Plain Layout
\begin_inset Caption Standard
\begin_layout Plain Layout
\begin_inset CommandInset label
LatexCommand label
name "fig:Page-map-levels"
\end_inset
Page map levels
\end_layout
\end_inset
\end_layout
\end_inset
\end_layout