post your solutions below Joel's Simple Haskell Solution: module Main ( main ) where import Data.Word import System.IO import qualified Data.ByteString as S import qualified Data.ByteString.Char8 as C import qualified Data.Map.Strict as DM -- The main function builds the map from words to counts, then -- outputs the result of converting the map to a list, sorting -- it by values and showing the first 10 main :: IO() main = do map <- buildMapFromInput (DM.fromList []) putStrLn(display $ take 10 $ sortListMap $ DM.toList(map)) display :: [(S.ByteString,Int)] -> String display m = concat (map (\(w,i) -> C.unpack w ++ ": " ++ (show i) ++ "\n") m) -- Builds the word map from stdin buildMapFromInput :: DM.Map S.ByteString Int -> IO(DM.Map S.ByteString Int) buildMapFromInput m = do con <- S.getContents return $ addWordsToMap (findWords $ S.map toLower con) m toLower :: Word8 -> Word8 toLower i = if i > 0x40 && i < 0x5B then i + 0x20 else i -- Finds the words in a given ByteString according to the rule that -- anything not a letter is a separator findWords :: S.ByteString -> [S.ByteString] findWords bs = case (S.null bs) of True -> [] False -> (S.takeWhile isAlpha bs') : findWords (S.dropWhile isAlpha bs') where bs' = S.dropWhile (not.isAlpha) bs isAlpha :: Word8 -> Bool isAlpha c = c > 0x60 && c < 0x7B -- Adds a list of words to the current map, incrementing the counter -- of any words already present addWordsToMap :: [S.ByteString] -> DM.Map S.ByteString Int -> DM.Map S.ByteString Int addWordsToMap [] m = m addWordsToMap (word:ws) m = addWordsToMap ws m' where m' = DM.insertWith (+) word 1 m -- Quicksort on values in descending order sortListMap :: [(S.ByteString,Int)] -> [(S.ByteString,Int)] sortListMap [] = [] sortListMap ((s,i):sis) = sortListMap [(gs,gi) | (gs,gi) <- sis, gi > i] ++ [(s,i)] ++ sortListMap [(ls,li) | (ls,li) <- sis, li <= i] Toralf's More Idiomatic Haskell: {-# LANGUAGE OverloadedStrings #-} module Main (main) where import Data.Monoid import Data.String import Data.Word import Data.ByteString (ByteString) import qualified Data.ByteString as S import qualified Data.HashMap.Strict as M import qualified Data.List as L main :: IO () main = S.interact (stringify . select 10 . count . split . S.map toLower) select :: Int -> M.HashMap ByteString Int -> [(ByteString, Int)] select k = take k . L.sortBy cmp . M.toList where cmp x y = snd y `compare` snd x count :: [ByteString] -> M.HashMap ByteString Int count = L.foldl' update M.empty where update m s = M.insertWith (+) s 1 m split :: ByteString -> [ByteString] split = filter (not . S.null) . S.splitWith (not . isAlpha) isAlpha :: Word8 -> Bool isAlpha c = c > 0x60 && c < 0x7B || c > 0x40 && c < 0x5B toLower :: Word8 -> Word8 toLower i = if i > 0x40 && i < 0x5B then i + 0x20 else i stringify :: [(ByteString, Int)] -> ByteString stringify = S.intercalate "\n" . map str where str (s, n) = s <> ": " <> (fromString . show $ n) Python import collections import sys import re import operator # idiomatic, using collection Counter type def word_count_1(wl, most_common=10): return collections.Counter(wl).most_common(most_common) # pretending Counter object does not exist, slightly faster than word_count_1 def word_count_2(wl, most_common=10): def _increment_key(d, word): d[word] += 1 return d return sorted(reduce(_increment_key, wl, collections.defaultdict(int)).iteritems(), key=lambda x: x[1], reverse=True)[:most_common] # create a new list of every 100th word in the text, find the 20 most common words in that new list. # discard all the other words in the text, and run the complete word count on the remainder. # about twice as fast as word_count_2 def word_count_3(word_list, step=10, num_seed=20): most_common_set = set([word for word, num in word_count_2(word_list[::step], num_seed)]) return word_count_2([word for word in word_list if word in most_common_set]) # get the word count of every 10th word in the text, extrapolate towards the total word count # about twice as fast as word_count_3 with average error of 2.8%. def word_count_4(word_list, step=10): most_common = word_count_2(word_list[::step]) return [(w, n * step) for (w, n) in most_common] def line_format(line): return [word for word in re.split("\W", line.lower()) if word != ''] def lazy_line_reader(file_name, fmt): return (word for line in open(file_name) for word in fmt(line)) def lazy_count(file_name, most_common=10): def _increment_key(d, word): d[word] += 1 return d return sorted(reduce(_increment_key, lazy_line_reader(file_name, line_format), collections.defaultdict(int)).iteritems(), key=lambda x: x[1], reverse=True)[:most_common] non-functional C solution (hash table implementation broken) https://github.com/illdefined/funclub-04 [Flo] bash/haskell solutions (comments infile): https://github.com/nougad/funclub_wordcount Martins Erlang solutions (see commit history): https://github.com/martinrehfeld/fun-club-wordcount-erlang Bonus: shortest Shell command I could come up with: (tr -cs [a-zA-Z] \\n|sort -f|uniq -ci|sort -rn|head) ugly R solution f <- file( "moby-dic.txt" ); contents <- readChar( f, 1243538 ); #Yes, I cheated and checked the file length words <- sapply( strsplit( contents, "[^a-zA-Z0-9]+" )[[ 1 ]], tolower ); counts <- sort( table( words ) ); rev( tail( counts, 10 ) ); -- naive Haskell solution (Daniel van den Eijkel) import Data.Char import Data.List import qualified Data.Map as Map import Data.Tuple main = do -- read the file inputlist <- readFile "moby-dic.txt" let -- 'mklower' turns letters into lowercase letters and everything else into space mklower c | isAlpha c = toLower c | otherwise = ' ' -- splits inputlist into words after turning all characters into lowercase / space wordlist = words $ fmap mklower inputlist -- count the words using a map (Map String Int) wordmap = foldl' (\mp word -> Map.insertWith (+) word 1 mp) Map.empty wordlist -- convert the map into a list and sort it descending sortedWL = sortBy (flip compare) $ fmap swap $ Map.toList wordmap -- print out the first 10 elements of the list print $ take 10 sortedWL Scala solutiion: https://github.com/folone/funclub-words Slides: https://speakerdeck.com/folone/scala-solution-for-the-funclub-berlin-meeting Clojure solution - http://github.com/steerio/fun-club-word-stats ; The three "general purpose" functions doing the job (defn lines "Returns a sequence of lines from an inputstream" [stream] (line-seq (io/reader stream))) (defn words "Returns a sequence of words consuming a sequence of lines" [ls] (mapcat (fn [line] (re-seq #"[a-z]+" (lower-case line))) ls)) (defn n-most-frequent "Returns the `n` most frequent items from `xs` as a sequence of [item count] vectors" [n xs] (take n (sort-by second > (seq (frequencies xs))))) ; Command line entry point (defn main* "Prints the ten most frequent words in a sequence of lines along with the number of their appearances" [ls] (doseq [[word freq] (n-most-frequent 10 (words ls))] (println (format "%s: %d" word freq)))) ; Benchmarking is done in a separate function so it can easily be called from the REPL (defn bench [stream] (let [ls (lines stream)] (cr/bench (n-most-frequent 10 (words ls))))) (defn -main [& args] (case (first args) ; The -b option does a benchmark. This implies head retention. ; There's no other way: we cannot reprocess standard input. ("-b" "--bench") (bench System/in) ; Feel free to add other runtime options here ; Default: run once, don't keep a reference to the sequence (lazy) (doseq [[word freq] (n-most-frequent 10 (words (lines System/in)))] (println (format "%s: %d" word freq))))) Joel's Very Simple Python Solution: #!/usr/bin/env python import fileinput import re import string wordMap = {} # Create a regular expression to match any non-word character or digit pattern = re.compile('[\W\d]+') # Iterate through the file a line at a time for line in fileinput.input(): words = pattern.split(line.lower()) for word in words: if word: if word in wordMap: wordMap[word] += 1 else: wordMap[word] = 1 # Print the first 10 results for w in sorted(wordMap, key=wordMap.get, reverse=True)[:10]: print("%s: %s" % (w, wordMap[w]))