From 4a04ffd5c125dee7616ecb540af54af82e32f906 Mon Sep 17 00:00:00 2001 From: Lucian Mogosanu Date: Sat, 2 Aug 2014 13:39:18 +0300 Subject: [PATCH] posts: 025, 026 --- ...the-semantic-ambiguity-of-nested-lists.markdown | 368 ++++++++++++++++++++ posts/y01/026-the-first-year.markdown | 68 ++++ uploads/2014/07/NList1.hs | 40 +++ uploads/2014/07/NList2.hs | 34 ++ uploads/2014/08/stats.sh | 47 +++ 5 files changed, 557 insertions(+) create mode 100644 posts/y00/025-type-algebra-the-semantic-ambiguity-of-nested-lists.markdown create mode 100644 posts/y01/026-the-first-year.markdown create mode 100644 uploads/2014/07/NList1.hs create mode 100644 uploads/2014/07/NList2.hs create mode 100644 uploads/2014/08/stats.sh diff --git a/posts/y00/025-type-algebra-the-semantic-ambiguity-of-nested-lists.markdown b/posts/y00/025-type-algebra-the-semantic-ambiguity-of-nested-lists.markdown new file mode 100644 index 0000000..ad2aa5a --- /dev/null +++ b/posts/y00/025-type-algebra-the-semantic-ambiguity-of-nested-lists.markdown @@ -0,0 +1,368 @@ +--- +postid: 025 +title: Type algebra: the semantic ambiguity of nested lists +excerpt: A short incursion into limitations of algebraic types. +date: July 12, 2014 +author: Lucian Mogoșanu +tags: math, tech +--- + +A while ago I had a short debate on the semantics and structure of the +so-called "nested lists". For example Scheme naturally allows us to define +structures such as: + +~~~~ {.scheme} +'(2 (2 3) 4) +~~~~ + +Of course, this is an inherent quality of Lisp dialects, where code and data +blend and nested lists are required to express arbitrarily complex expressions. +Moreover, we should note that Lisp languages usually don't impose any type +strictness on lists (as lists are a particular case of pairs), allowing for +truly arbitrary code to be embedded inside a list, e.g.: + +~~~~ {.scheme} +'(1 #t 'a "yadda") +~~~~ + +This, as you may know, is the worst nightmare of Haskellers, or, more +generally, of the ML camp. We like our structures to be defined in terms of a +single well-formed type, so if we had an arbitrary type $A$ and a type of lists +$L$ defined in terms of $A$, i.e. $L\;A$, it would be inherently impossible to +construct nested lists, due to it leading to the (impossible) equality $A = +L\;A$. Mathematics aside, if we were to define the following in Haskell: + +~~~~ {.haskell} +> let f x = [x, [x]] in f 2 +~~~~ + +we would get quite a nasty error, along the lines of: + +~~~~ +Occurs check: cannot construct the infinite type: t0 = [t0] +In the expression: x +In the expression: [x] +In the expression: [x, [x]] +~~~~ + +given that it's quite impossible to unify an arbitrary type `t0` with `[t0]`. + +It is however possible, albeit unnatural, as we will see, to define nested +lists in Haskell. There exist at least two possible definitions, which I will +explore in the remaining paragraphs. + +## Definining the type of nested lists + +Before defining a nested-list type, which I take the liberty to name `NList`, +I'll start with observing that Haskell's built-in list type can be redefined +without all the sugary bells and whistles as follows: + +~~~~ {.haskell} +data List a = Empty | Cons a (List a) +~~~~ + +Its so-called "interface" consists of the two constructors `Empty` and `Cons` +and of the functions `head` and `tail`, having the signatures: + +~~~~ {.haskell} +head :: List a -> a +tail :: List a -> List a +~~~~ + +where `head` returns the first element of the ordered value-list pair, and +`tail` the second. We note that both are partial functions, i.e. applying them +on `Empty` will result in an error. + +Having said all this, there is an easy way to construct a nested-list type +starting from the definition of `List`, and you might have already seen it: add +a second constructor, one which would allow inserting a list inside another. +The `NList` type would therefore look like something along the lines of: + +~~~~ {.haskell} +data NList a = Empty | ConsElem a (NList a) | ConsList (NList a) (NList a) +~~~~ + +This doesn't look particularly bad, except we have one problem: we can no +longer define a single `head` function for this type. We would need to have two +`head`s, such as: + +~~~~ {.haskell} +headNElem :: NList a -> a +headNList :: NList a -> NList a +~~~~ + +having the definitions: + +~~~~ {.haskell} +headNElem (ConsElem x _) = x +~~~~ + +and + +~~~~ {.haskell} +headNList (ConsList x _) = x +~~~~ + +Of course, we can also define a `tail` function: + +~~~~ {.haskell} +tailNList :: NList a -> NList a +tailNList (ConsElem _ y) = y +tailNList (ConsList _ y) = y +~~~~ + +The problem with `headNElem` and `headNList` is that, not only they're partial +functions, but they're also unusable by themselves in practice. For the sake of +testing our implementation, let us define a `test_list` `[2, [2, 3], 4]`, or, +without sugar: + +~~~~ {.haskell} +test_list :: NList Integer +test_list = ConsElem 2 $ ConsList (ConsElem 2 $ ConsElem 3 $ Empty) + $ ConsElem 4 Empty +~~~~ + +If `test_list` were some arbitrary list, we'd have no way of knowing whether +the first element was built using `ConsElem` or `ConsList`, leading to the +following problem: + +~~~~ {.haskell} +> headNList test_list +*** Exception: drafts/NList1.hs:33:1-28: + Non-exhaustive patterns in function headNList +~~~~ + +This is indeed problematic. One way around the issue would involve defining a +function `isList` that checks whether the head is a list or an element. +Another, very similar method would involve giving up `headNElem` and making +`headNList` have the return type `Either a (NList a)`. Both approaches would +result in code looking painfully similar to spaghetti, due to the necessity of +case handling. This gets especially nasty when we don't immediately care about +the nature of our elements, so we'd have to essentially duplicate code to do +one thing for two different cases. In other words, welcome to type hell. Or +limbo. + +The third way consists in giving up `headNElem` and making `headNList` look +somewhat similar to `tailNList`, by providing the former with an additional +definition: + +~~~~ {.haskell} +headNList (ConsElem x _) = ConsElem x Empty +~~~~ + +loosely translated as "take the head and insert it into an empty list", +ensuring that we always return a value of the type `NList a`. + +To make our implementation more uniform, we can define some constructor +functions: + +~~~~ {.haskell} +emptyNList :: NList a +emptyNList = Empty + +consElem :: a -> NList a -> NList a +consElem = ConsElem + +consList :: NList a -> NList a -> NList a +consList = ConsList +~~~~ + +and make our module export all the definitions useful for the outside world: + +~~~~ {.haskell} +module NList + ( NList + , consElem + , consList + , emptyNList + , headNList + , tailNList + ) where +~~~~ + +Note that this more convenient implementation is also incorrect. Calling +`headNList` on `[2, [2, 3], 4]` and returning a *list* defies the semantics of +lists: what we're doing is taking the element and wrapping it in a list; what +we *should* be doing is take the element and return it as it is, which we've +established we can't do in Haskell without cluttering the implementation. To +exemplify the ambiguity, we ponder what happens when we call `headNList` on +a singleton: + +~~~~ {.haskell} +*NList> headNList (ConsElem 2 Empty) == ConsElem 2 Empty +True +~~~~ + +meaning that we never know whether our `head` function returns a singleton or +an element, i.e. if the original list contained `2` or `[2]`. Either way, this +implementation of `NList` lacks consistency and would get our average +programmer into trouble quite quickly, no matter how we put it. + +There is at least one more way to define nested-list types in Haskell. There +are probably more, but I will present only one more, and I will name it the +"alternate". + +## An alternate definition + +To provide this "alternate" definition, I propose the clean approach of reusing +the existing list type `[a]` to build our new type. Reiterating, our goal is to +let the Haskell programmer build lists with nested structure, as illustrated by +our earlier example, `[2, [2, 3], 4]`. Let's start by defining `NList` as +a type synonym: + +~~~~ {.haskell} +type NList a = [a] +~~~~ + +This (quite obviously) doesn't work, because we want to be able to (possibly) +store lists inside other lists, intuitively leading us to the initial problem +of `NList a = [NList a]`. We can however express this type recurrence relation +in Haskell in a plain manner, using `data`: + +~~~~ {.haskell} +data NList a = List [NList a] +~~~~ + +Notice that this is a legitimate Haskell type. It isn't however very useful +to us, since it only allows us to construct arbitrarily nested *empty* lists, +such as: + +~~~~ {.haskell} +List [List [], List [List []]] +~~~~ + +By taking another look at the example above, we notice that the `NList` type +describes the proper structure, without however giving us the proper tools to +populate the lists with elements. We'll solve this by "lifting" elements into +lists, turning the definition into: + +~~~~ {.haskell} +data NList a = Elem a | List [NList a] +~~~~ + +Given this context, the example `test_list` will have the definition: + +~~~~ {.haskell} +test_list :: NList Integer +test_list = List [Elem 2, List [Elem 2, Elem 3], Elem 4] +~~~~ + +The rest of the interface stays the same: we can still build `headNList` and +`tailNList` with the previously defined interface. Note the partialness, i.e. +the impossibility of defining the functions on values constructed using `Elem`: + +~~~~ {.haskell} +headNList (List (x : _)) = x +tailNList (List (_ : xs)) = List xs +~~~~ + +This holds as well for `emptyNList`, `consElem` and `consList`, which now +become a sort of "pseudo-constructors", as we can't define them purely as the +two constructors provided above. We can however define the former *in terms of* +the latter: + +~~~~ {.haskell} +emptyNList = List [] +consElem x xs = consList (Elem x) xs +consList x (List xs) = List $ x : xs +~~~~ + +The implementation of `consElem` and `consList` looks as elegant as it can get, +although we no longer have some of the nonsensical cases handled statically +(i.e. at type level); for example `consList (Elem 1) (Elem 2)` needs to be +kept undefined, either explicitly or implicitly. + +To express the problem in semantical terms, we're rid of the ugly spaghetti +implementation and/or ambiguities, only we're now smashing our heads against +the dirty issue of having elements treated as lists! This might make sense in +the formal contexts of singletons or applicative functors, but here we're just +using this as a hack to obtain a practical advantage, which, in a way, beats +the whole purpose of correctness and clean design that Haskell programs should +stand for. + +But before admitting defeat, let's make a short analysis of the problem at a +purely algebraic level. + +## Some abstract nonsense + +If you've got some insight of how Haskell works, you are probably aware of the +fact that Haskell types are built upon a mechanism called "algebraic data +types". Now, if you're a mathematician, you are very probably aware of the fact +that the various type-theoretical disciplines use only a few fundamental +operations to construct types, the most important being products, sums (also +called disjoint unions) and functions, with the possibility of reducing them +only to the former two if we view functions as a particular type of binary +relations (themselves expressed as products). + +Getting back to programming for a bit, all (or most?) languages provide some +form of product and sum types: C has structs and unions respectively, and C++ +also introduces classes and objects, which are in fact a whole different story. +Java doesn't have a union type, but the canonical `Either` type can be build on +top of classes; this is also true for Python, maybe due to the fact that +disjoint unions can cause real trouble in dynamically-typed languages. Finally, +Haskell user types are constructed exclusively based on products (e.g. `data +Pair a b = MkPair a b`) and sums (e.g. `data T = T1 | T2`). Mathematical +notation uses $\times$ to denote products and $+$ to denote sums. + +Moreover, instead of using the set-theoretical $x \in A$, we would use $x : A$ +for types, and we would define higher-order types by separating the type from +its parameter using a space character, e.g. $F\;X$. Also, the empty set would +by replaced by a so-called "unit" type, i.e. the type with a single value, +denoted as $\mathbb{1}$. + +Using this language, we can define the type of lists as: + +$L\;A = \mathbb{1} + A \times L\;A$ + +(Note that we assume that the precedences of the binary operators $\times$ and +$+$ are the same as those of arithmetic products and sums respectively.) + +Looking back at `List`, we notice that the two definitions are isomorphic. We +can thus define the two `NList` types in the same way, I will call them +$\text{NL}_1$ and $\text{NL}_2$: + +$\text{NL}_1\;A = \mathbb{1} + A \times \text{NL}_1\;A + \text{NL}_1\;A \times \text{NL}_1\;A$. + +$\text{NL}_2\;A = A + L\;(\text{NL}_2\;A)$ + +Now, the first thing that pops to my mind by looking at the definitions of +$\text{NL}_1$ and $\text{NL}_2$ is that they look different; but can we *prove* +that the two types are equivalent? Assuming that $\times$ and $+$ are +distributive, could we juggle the two expressions in some way so that one leads +to the other? I'm not sure that we can. The best I can think of is doing some +factoring in $\text{NL}_1$'s equation, getting to: + +$\text{NL}_1\;A = \mathbb{1} + (A + \text{NL}_1\;A) \times \text{NL}_1\;A$ + +The part $A + \text{NL}_1\;A$ seems to suggest that we can "select" between +values and lists, but other than that it doesn't seem that we can obtain the +second type from the first; nor do I venture to try and obtain the first from +the second, since $\text{NL}_2$'s definition depends on $L$. I will leave this +for the reader to explore further, I'm really curious if one can prove, or even +better, disprove the equivalence between the two. + +## Conclusion + +I've attached the Haskell source code for the two definitions as [NList1.hs][1] +and [NList2.hs][2] respectively. Feel free to browse through them, modify them +and get your own insight on the matter. + +Ultimately, I believe that "nested lists" are an ill-posed problem. Ok, we have +at least two ways to define them, but what are they useful for? If we want to +model nested data, then maybe we're better off defining multi-way trees and +using them as such. If, on the other hand, we want to model arbitrary +expressions like Scheme does, then we'd better use the semantics of a "type of +expressions", along with the algorithms appropriate for this application. + +Jonathan Tang's [Write Yourself a Scheme in 48 Hours][3] defines Lisp values in +a way looking strikingly similar to our second `NList` definition. This makes a +lot more sense, since an `Atom` (the equivalent of our `Elem`) may be a Lisp +value, although it's not necessarily a nested list. While this narrows down the +scope to the parsing and evaluation of Scheme expressions, it also looks much +clearer and cleaner, and it's unquestionably the manner in which programming +must be done in Haskell, or in any other strongly-typed functional language for +that matter. + +[1]: /uploads/2014/07/NList1.hs +[2]: /uploads/2014/07/NList2.hs +[3]: http://jonathan.tang.name/files/scheme_in_48/tutorial/parser.html#primitives diff --git a/posts/y01/026-the-first-year.markdown b/posts/y01/026-the-first-year.markdown new file mode 100644 index 0000000..ec200bd --- /dev/null +++ b/posts/y01/026-the-first-year.markdown @@ -0,0 +1,68 @@ +--- +postid: 026 +title: The Tar Pit: the first year +excerpt: A quantitative analysis. +author: Lucian Mogoșanu +date: August 2, 2014 +tags: asphalt +--- + +

+[What's one more year][1]? +A circle in the sky? +The future's in motion +You're free to fly +

+ +Last year on the 22nd of July [I started this blog][2]. It was only about the +second time I had attempted to create a blank slate for me to imprint my +thoughts on. I was wildly confused, but determined go ahead with this attempt +of mine, and now, here, this is what it has led to. + +A year gone, a small yet robust foundation for what's about to come, a shitload +of drafts unpublished for now. Yes, I am honest enough to myself as to have a +set of standards for self-publishing. + +I will look back now and examine that which was: I wrote 38 posts comprising +33562 words. The shortest post was "[The Tar Pit: about][3]", only 84 words. +The longest post was "[Type algebra: the semantic ambiguity of nested +lists][4]", of 2367 words. I wrote about 883.2 words per post, with a standard +deviation of 445.2, denoting a rather big variation in the length of my +articles. How or why, I will let the reader judge. + +About the content itself: since running a Markov chain parser on the posts +would be overdoing it, I'll just look at the word frequencies. The most +frequent word was "the", with 1689 appearances. The most frequent verb was +"is", 469 appearances, 1066 if we count all declensions and tenses of "to be", +while the second most frequent verb was "have", with 148 appearances; I won't +bother finding all its forms. + +The most frequent pronoun was, expectedly, "I", 564 appearances, which makes me +something of a self-centered bastard, but it can be forgiven, since I'm writing +a personal blog, not a fictional novel. The most frequent nouns were "Haskell", +72 appearances, and "time", 67 appearances. The most frequent proper name was +"Romania" (including "Romanian" and other variations), with 84 appearances, +more than Haskell only because I've included all the variations. The most +frequent adjective was "more", 166 appearances. + +There were too many least frequent words to name them all here, but some of the +least frequent verbs, with only one appearance each, were "abstracting" or +"abuse" or "begs", depending on which one you like the most. The least frequent +pronoun I could find was "each", with at most 9 appearances, since it might +have been used as an adjective in some cases. And by the way, the least +frequent adjectives were "absolute", "acceptable", "rigid" and so on. + +One of the least frequently used nouns was "achievements", while the least +frequent proper names are too many to be mentioned here. I will only remind +"Uhura" and "APL" as two of my favourites. + +I've published about 5202 unique words, give or take a few hundred due to +contractions and other such morpholinguistic phenomena. And that's about it. +The bash script I used to extract this data [is available][5] for you to play +with and cross-examine the results. + +[1]: https://www.youtube.com/watch?v=Kt1-iGD15e0 +[2]: /posts/y00/000-first-post.html +[3]: /posts/y00/003-about.html +[4]: /posts/y00/025-type-algebra-the-semantic-ambiguity-of-nested-lists.html +[5]: /uploads/2014/08/stats.sh diff --git a/uploads/2014/07/NList1.hs b/uploads/2014/07/NList1.hs new file mode 100644 index 0000000..b3bf680 --- /dev/null +++ b/uploads/2014/07/NList1.hs @@ -0,0 +1,40 @@ +module NList + ( NList + , consElem + , consList + , emptyNList + , headNList + , tailNList + ) where + +data NList a = + Empty + | ConsElem a (NList a) + | ConsList (NList a) (NList a) + deriving (Show, Eq) + +test_list :: NList Integer +test_list = ConsElem 2 $ ConsList (ConsElem 2 $ ConsElem 3 $ Empty) + $ ConsElem 4 Empty + +emptyNList :: NList a +emptyNList = Empty + +consElem :: a -> NList a -> NList a +consElem = ConsElem + +consList :: NList a -> NList a -> NList a +consList = ConsList + +headNElem :: NList a -> a +headNElem (ConsElem x _) = x + +headNList :: NList a -> NList a +headNList (ConsList x _) = x +-- contrived definition +-- e.g. (headNList $ ConsElem 2 Empty) == ConsElem 2 Empty +headNList (ConsElem x _) = ConsElem x Empty + +tailNList :: NList a -> NList a +tailNList (ConsElem _ y) = y +tailNList (ConsList _ y) = y diff --git a/uploads/2014/07/NList2.hs b/uploads/2014/07/NList2.hs new file mode 100644 index 0000000..0bdedab --- /dev/null +++ b/uploads/2014/07/NList2.hs @@ -0,0 +1,34 @@ +module NList + ( NList + , consElem + , consList + , emptyNList + , headNList + , tailNList + ) where + +data NList a = + Elem a + | List [NList a] + deriving (Show, Eq) + +-- keeping this for reference +data NList0 a = List' [NList0 a] deriving Show + +test_list :: NList Integer +test_list = List [Elem 2, List [Elem 2, Elem 3], Elem 4] + +emptyNList :: NList a +emptyNList = List [] + +consElem :: a -> NList a -> NList a +consElem x xs = consList (Elem x) xs + +consList :: NList a -> NList a -> NList a +consList x (List xs) = List $ x : xs + +headNList :: NList a -> NList a +headNList (List (x : _)) = x + +tailNList :: NList a -> NList a +tailNList (List (_ : xs)) = List xs diff --git a/uploads/2014/08/stats.sh b/uploads/2014/08/stats.sh new file mode 100644 index 0000000..57f544c --- /dev/null +++ b/uploads/2014/08/stats.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +function print_usage { + echo "Usage: $0 FILE ..." +} + +if [ "$#" -lt 1 ]; then + print_usage; + exit 1; +fi + +files="$*" + +# word counts +words=$(wc -w $files) +numposts=$(ls -1 $files | wc -l) +totalwords=$(echo -e "$words" | grep total | tr -s ' ' | sed -e 's/^ //g' | cut -d ' ' -f1) +words=$(echo -e "$words" | grep -v total | sort -n) # leave out total + +mean=$(echo "scale=1; $totalwords / $numposts" | bc) +min=$(echo -e "$words" | head -n 1 | tr -s ' ') +max=$(echo -e "$words" | tail -n 1 | tr -s ' ') + +echo "Number of posts: $numposts" + +echo "Word count:" +echo "Total: $totalwords" +echo "Min: $min" +echo "Max: $max" +echo "Mean: $mean" + +s=0 +IFS=$'\n' +for w in $words; do + current=$(echo $w | tr -s ' ' | cut -d ' ' -f2) + s=$(echo "scale=1; $s + ($current - $mean) ^ 2" | bc) +done + +stddev=$(echo "scale=1; sqrt ($s/$numposts)" | bc) +echo "Stddev: $stddev" + +# word frequencies +IFS=$' ' +cat $files | sed -e 's/[^a-zA-Z ]//g' | tr '[:upper:]' '[:lower:]' | tr ' ' '\n' | sort | uniq -c | awk '{print $1,$2;}' | sort -n > /tmp/freqs + +# This one's case sensitive +#cat $files | sed -e 's/[^a-zA-Z ]//g' | tr ' ' '\n' | sort | uniq -c | awk '{print $1,$2;}' | sort -n > /tmp/freqs -- 1.7.10.4